author    JJ    2023-07-19 03:10:53 +0000
committer JJ    2023-07-19 03:26:12 +0000
commit    3b4cf68650041e1c4be43d7eba58879e37b2c760 (patch)
tree      1d563e821d6ceedd6bfd70802cd77d1201e3c954 /src
parent    f3b8cdaf74dac7533894222d6c1239cef898831b (diff)
compiler: lex scope, throw errors
Diffstat (limited to 'src')
-rw-r--r--  src/lex.rs   | 75
-rw-r--r--  src/main.rs  |  1
2 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/src/lex.rs b/src/lex.rs
index bfd74ae..8999aea 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -1,21 +1,34 @@
use multipeek::multipeek;
-/// **Basic** syntax tokens.
+pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
+pub type TokenStream = Vec<Token>;
+
+/// **Basic** syntax tokens. Form an unambiguous TokenStream.
+#[derive(Clone, PartialEq)]
pub enum Token {
Word(String), // identifiers.
Lit(String), // literal value, ex. for strings/comments.
Sep(char), // punctuation. non-word tokens.
- Ind(usize) // indentation.
+ Begin, End // scope indicators.
}
-/// Lexes a file into a Vec of fundamental Tokens.
-pub fn tokenize(input: &str) -> Vec<Token> {
+/// All keywords that may continue a line; used to recognize valid line splits.
+const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];
+
+/// Parses whitespace-sensitive code into an unambiguous TokenStream.
+/// Also useful for formatting.
+// todo: support indentation within expressions
+// nim: "As a rule of thumb, indentation within expressions is
+// allowed after operators, an open parenthesis and after commas."
+pub fn tokenize(input: &str) -> Result<TokenStream> {
// The design of this lexer leans heavily on multipeek's arbitrary peeking.
// Tokens are matched by looping within their case until complete.
// This eliminates the need for almost all global lexer state.
use Token::*;
let mut start_of_line = true; // state
+ let mut indent_level = 0; // state
+ let mut indent_width = None; // state
let mut buf = String::new(); // buffer
let mut res = Vec::new(); // result
@@ -24,19 +37,53 @@ pub fn tokenize(input: &str) -> Vec<Token> {
while let Some(c) = input.next() {
match c {
' ' => {
- if start_of_line { // indentation, to be dealt with later
- let mut indendation_level = 1;
+ if start_of_line { // indentation
+ let mut current_indent_level = 1;
while let Some(x) = input.peek() {
match x {
- ' ' => indendation_level += 1,
+ ' ' => current_indent_level += 1,
'\n' => break, // empty line
- _ => {
- res.push(Ind(indendation_level));
+ _ => { // indentation ends
+ // Really gross, but effective. This checks whether the previous token was a
+ // newline, and the token before it punctuation or a known "operator"; if so,
+ // it disregards the indentation and treats this line as a continuation.
+ if let Some(&Sep('\n')) = res.last() { // last()/nth(1) cannot underflow
+ if let Some(y) = res.iter().rev().nth(1) {
+ if let Word(z) = y {
+ if valid_continuations.contains(&&z[..]) {
+ res.pop();
+ break;
+ }
+ } else if let Sep(_) = y {
+ res.pop();
+ break;
+ }
+ }
+ }
+
+ // fires only once: the first indented line fixes the indent width, so any number of spaces per level is supported as long as it is consistent
+ if indent_width.is_none() {
+ indent_width = Some(current_indent_level);
+ }
+
+ let indent_width = indent_width.unwrap(); // safe. see above
+ if current_indent_level % indent_width != 0 {
+ return Err("indentation is offset".into());
+ }
+
+ let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize;
+ match diff {
+ 0 => (), // same level of indentation
+ 1 => res.push(Begin), // new level of indentation
+ -1 => res.push(End), // old level of indentation
+ _ => return Err("indentation stepped by too much in one go".into())
+ }
+ indent_level = current_indent_level;
break;
}
}
}
- } else { // get rid of whitespace
+ } else { // get rid of excess (all) whitespace
while input.peek() == Some(&' ') { input.next(); }
}
},
@@ -44,7 +91,7 @@ pub fn tokenize(input: &str) -> Vec<Token> {
start_of_line = true;
res.push(Sep('\n'))
},
- c if c.is_whitespace() => (), // tabs etc are not supported
+ c if c.is_whitespace() => return Err("tabs etc are not supported".into()),
'\'' => { // single quoted strings, i.e. chars
res.push(Sep('\''));
while let Some(x) = input.next() {
@@ -133,11 +180,11 @@ pub fn tokenize(input: &str) -> Vec<Token> {
}
res.push(Word(String::from(&buf)));
},
- '.' | ',' | ':' | ';' |
- '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep
+ '.' | ',' | ':' | ';' | // punctuation
+ '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)),
_ => res.push(Sep(c)) // for now: treat unknown chars as Sep
}
buf.clear();
}
- return res;
+ return Ok(res);
}
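
A minimal test sketch of the new interface; not part of this commit. It assumes
only what the hunks above show: the Word tokens rely on the identifier branch
outside this diff, and the commented stream is the intended Begin/End and
continuation behavior rather than verified output. Something like this could
sit at the bottom of src/lex.rs:

#[cfg(test)]
mod tests {
    use super::*;

    // Intended stream for a two-space-indented snippet, per the hunks above
    // (Word tokens assume the identifier branch outside this diff):
    //
    //   if a      =>  Word("if"), Word("a"), Sep('\n'),
    //     b and   =>  Begin, Word("b"), Word("and"),
    //     c       =>  Word("c"), Sep('\n')
    //
    // "and" is a valid continuation, so the newline pushed after it is popped
    // and no new scope begins at "c". A one-level dedent emits End; a jump of
    // two or more levels, or an indent that is not a multiple of the detected
    // width, is an Err.

    #[test]
    fn empty_input_lexes_to_empty_stream() {
        assert!(tokenize("").unwrap().is_empty());
    }

    #[test]
    fn tabs_are_a_hard_error() {
        // previously skipped silently; now rejected per the hunk above
        assert!(tokenize("\tx").is_err());
    }
}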
diff --git a/src/main.rs b/src/main.rs
index e61c2ec..c366367 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,4 @@
+#![allow(non_upper_case_globals)]
#![feature(exclusive_range_pattern, let_chains)]
mod lex;
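
With tokenize returning a Result, lexer errors can now propagate all the way
out of main. A hypothetical driver for this crate root (everything beyond the
two lines shown above is an assumption, including the binary's usage string):

#![allow(non_upper_case_globals)]
#![feature(exclusive_range_pattern, let_chains)]
mod lex;

use lex::Token;

// Hypothetical: lex a file named on the command line, surfacing errors via `?`.
fn main() -> lex::Result<()> {
    let path = std::env::args().nth(1).ok_or("usage: compiler <file>")?;
    let source = std::fs::read_to_string(path)?;
    let tokens = lex::tokenize(&source)?;

    // Token does not derive Debug, so summarize the stream by hand.
    let scopes = tokens.iter().filter(|t| **t == Token::Begin).count();
    println!("lexed {} tokens ({} scopes opened)", tokens.len(), scopes);
    Ok(())
}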