author    | JJ | 2023-07-18 22:45:44 +0000
committer | JJ | 2023-07-18 22:45:44 +0000
commit    | f3b8cdaf74dac7533894222d6c1239cef898831b (patch)
tree      | 5f6953b05ed6954ccf7293418a1262a434cf5e52 /src
parent    | fbfeffad5d85b73102a0f4e5ed607c33319f09e6 (diff)
compiler: complete basic lexer
Diffstat (limited to 'src')
-rw-r--r-- | src/lex.rs | 168
1 file changed, 95 insertions, 73 deletions
@@ -1,121 +1,143 @@
-use std::todo;
-use crate::tree::Tree;
 use multipeek::multipeek;
 
 /// **Basic** syntax tokens.
 pub enum Token {
-    Word(String),
-    Lit(String),
-    Sep(char)
+    Word(String), // identifiers.
+    Lit(String),  // literal value, ex. for strings/comments.
+    Sep(char),    // punctuation. non-word tokens.
+    Ind(usize)    // indentation.
 }
 
-/// Parses whitespace-sensitive code into an unambiguous syntax tree.
-/// Also useful for formatting.
-pub fn tokenize(input: &str) -> Tree<Vec<Token>> {
+/// Lexes a file into a Vec of fundamental Tokens.
+pub fn tokenize(input: &str) -> Vec<Token> {
+    // The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
+    // Tokens are matched by looping within their case until complete.
+    // This then eliminates the need for almost all global parser state.
+
     use Token::*;
-    let mut indendation_level = 0;
-    let mut buffer = String::new();
-    let mut result = Tree::new(Vec::new());
-    let ctx = result.data(result.root());
+    let mut start_of_line = true; // state
+    let mut buf = String::new();  // buffer
+    let mut res = Vec::new();     // result
 
     // `char` in rust is four bytes it's fine
     let mut input = multipeek(input.chars());
     while let Some(c) = input.next() {
         match c {
-            ' ' => todo!(),
-            '\n' => todo!(),
-            '\t' => todo!(),
-            '\'' => {
-                ctx.push(Sep('\''));
+            ' ' => {
+                if start_of_line { // indentation, to be dealt with later
+                    let mut indendation_level = 1;
+                    while let Some(x) = input.peek() {
+                        match x {
+                            ' ' => indendation_level += 1,
+                            '\n' => break, // empty line
+                            _ => {
+                                res.push(Ind(indendation_level));
+                                break;
+                            }
+                        }
+                    }
+                } else { // get rid of whitespace
+                    while input.peek() == Some(&' ') { input.next(); }
+                }
+            },
+            '\n' => { // newlines are separators
+                start_of_line = true;
+                res.push(Sep('\n'))
+            },
+            c if c.is_whitespace() => (), // tabs etc are not supported
+            '\'' => { // single quoted strings, i.e. chars
+                res.push(Sep('\''));
                 while let Some(x) = input.next() {
-                    if x == '\\' {
-                        buffer.push(x);
-                        continue;
-                    } else if x == '\'' {
-                        break;
-                    } else {
-                        buffer.push(x);
+                    match x {
+                        '\'' => break,
+                        '\\' => if let Some(y) = input.next() { buf.push(y) },
+                        _ => buf.push(x)
                     }
                 }
-                ctx.push(Lit(String::from(&buffer)));
-                ctx.push(Sep('\''));
+                res.push(Lit(String::from(&buf)));
+                res.push(Sep('\''));
             },
-            '"' => { // triple quoted strings and regular strings
+            '"' => { // triple quoted strings
                 if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') {
                     input.next(); input.next();
-                    ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
-                    while let Some(c) = input.next() {
-                        if c == '"' &&
-                           input.peek_nth(1) == Some(&'"') &&
-                           input.peek_nth(2) == Some(&'"') {
-                            input.next(); input.next();
-                            break;
+                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
+                    while let Some(x) = input.next() {
+                        match x {
+                            '"' if input.peek_nth(1) == Some(&'"') &&
+                                   input.peek_nth(2) == Some(&'"') => {
+                                break;
+                            },
+                            _ => buf.push(x)
                         }
-                        buffer.push(c);
                     }
-                    ctx.push(Lit(String::from(&buffer)));
-                    ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
-                } else {
-                    ctx.push(Sep('"'));
+                    res.push(Lit(String::from(&buf)));
+                    input.next(); input.next();
+                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
+                } else { // regular strings
+                    res.push(Sep('"'));
                     while let Some(x) = input.next() {
-                        if x == '\\' {
-                            buffer.push(x);
-                            continue;
-                        } else if x == '"' {
-                            break;
+                        match x {
+                            '"' => break,
+                            '\\' => if let Some(y) = input.next() { buf.push(y) },
+                            _ => buf.push(x)
                         }
-                        buffer.push(c);
                     }
-                    ctx.push(Lit(String::from(&buffer)));
-                    ctx.push(Sep('"'));
+                    res.push(Lit(String::from(&buf)));
+                    res.push(Sep('"'));
                 }
             },
-            '#' => {
+            '#' => { // block comment, can be nested
                 if input.peek() == Some(&'[') {
                     input.next();
-                    ctx.push(Sep('#')); ctx.push(Sep('['));
+                    res.push(Sep('#')); res.push(Sep('['));
                     let mut comment_level = 1;
                     while let Some(x) = input.next() && comment_level > 0 {
-                        if x == '#' && input.peek() == Some(&'[') {
-                            comment_level += 1;
-                            input.next();
-                        } else if x == ']' && input.peek() == Some(&'#') {
-                            comment_level -= 1;
-                            input.next();
-                        } else {
-                            buffer.push(x);
+                        match x {
+                            '#' if input.peek() == Some(&'[') => {
+                                comment_level += 1;
+                                input.next();
+                            },
+                            ']' if input.peek() == Some(&'#') => {
+                                comment_level -= 1;
+                                input.next();
+                            },
+                            _ => buf.push(x)
                         }
                     }
-                    ctx.push(Lit(String::from(&buffer)));
-                    ctx.push(Sep(']')); ctx.push(Sep('#'));
-                } else {
-                    ctx.push(Sep('#'));
-                    while let Some(x) = input.next() && x != '\n' {
-                        buffer.push(x);
+                    res.push(Lit(String::from(&buf)));
+                    res.push(Sep(']')); res.push(Sep('#'));
+                } else { // standard comment, runs until eol
+                    res.push(Sep('#'));
+                    while let Some(x) = input.peek() {
+                        match x {
+                            '\n' => break,
+                            _ => {
+                                buf.push(*x);
+                                input.next();
+                            }
+                        }
                     }
-                    ctx.push(Lit(String::from(&buffer)));
+                    res.push(Lit(String::from(&buf)));
                 }
             },
-            'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
+            'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier
                 while let Some(x) = input.peek() {
                     match x {
                         'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
-                            buffer.push(c);
+                            buf.push(*x);
                             input.next();
                         },
                         _ => break
                     }
                 }
-                ctx.push(Word(String::from(&buffer)));
+                res.push(Word(String::from(&buf)));
             },
             '.' | ',' | ':' | ';' |
-            '(' | ')' | '[' | ']' | '{' | '}' => ctx.push(Sep(c)),
-            _ => ctx.push(Sep(c))
+            '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep
+            _ => res.push(Sep(c)) // for now: treat unknown chars as Sep
         }
-        buffer.clear();
+        buf.clear();
     }
-    return result;
+    return res;
 }
-
-// note: we can't have a TokenStream because there is significant whitespace. so, we construct the tree structure here, although we don't do anything fancy with the tokens.
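For context, here is a minimal sketch of how the new `Vec<Token>`-returning `tokenize` could be driven by a caller. The `main` function, the sample input string, and the printing are illustrative assumptions, not part of this commit; only `Token` and `tokenize` come from the lexer above.

```rust
// Hypothetical driver: assumes it lives in the same module as the lexer above,
// so `Token` and `tokenize` are in scope, and that the crate is built with a
// toolchain accepting the unstable features the lexer relies on
// (let-chains, exclusive range patterns).
fn main() {
    // Illustrative input: identifiers, punctuation, and a line comment.
    let src = "foo(bar): # a comment\n";
    for token in tokenize(src) {
        match token {
            Token::Word(w) => println!("Word({w})"),
            Token::Lit(l)  => println!("Lit({l:?})"),
            Token::Sep(s)  => println!("Sep({s:?})"),
            Token::Ind(n)  => println!("Ind({n})"),
        }
    }
}
```

Since `tokenize` now returns a flat `Vec<Token>` with explicit `Ind` and `Sep('\n')` entries instead of building a tree, any whitespace-sensitive structure is presumably left for a later parsing pass to reconstruct.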