From 3a25fa06d2cd9d2b89b68622415d7168cf06c0da Mon Sep 17 00:00:00 2001
From: JJ
Date: Thu, 26 Oct 2023 03:34:03 -0700
Subject: compiler: unicode support, reconsider indentation handling

---
 src/lex.rs | 101 +++++++++++++++++++++----------------------------------------
 1 file changed, 35 insertions(+), 66 deletions(-)
diff --git a/src/lex.rs b/src/lex.rs
index d3d5a7e..7e31476 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -8,6 +8,7 @@ pub enum LexicalError {
     InvalidIndentation,
     MismatchedParens,
     MismatchedBrackets,
+    UnknownPunctuation,
 }
 
 impl std::fmt::Display for LexicalError {
@@ -24,7 +25,7 @@ pub enum Token {
     Num(String),    // numeric value, ex. 413, 0b101011, 0xabcd
     Lit(Literal),   // literal value, ex. for strings/comments.
     Sep(Punctuation),   // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
-    Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead?
+    Indent(usize),      // indentation. denotes line breaks and scope at which a line starts.
 }
 
 #[derive(Clone, PartialEq)]
@@ -93,16 +94,12 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
     enum Bracket { Generic, Array }
     struct State {
         start_of_line: bool,
-        indent_level: isize,
-        indent_width: isize,
         paren_stack: Vec<Paren>,
         bracket_stack: Vec<Bracket>,
     }
 
     let mut state = State {
         start_of_line: true,
-        indent_level: 0,
-        indent_width: 0,
         paren_stack: vec!(),
         bracket_stack: vec!(),
     };
@@ -114,66 +111,35 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
     let mut input = multipeek(input.chars());
     while let Some(c) = input.next() {
         match c {
-            ' ' => {
-                if state.start_of_line { // indentation
-                    let mut current_indent_level = 1;
-                    while let Some(x) = input.peek() {
-                        match x {
-                            ' ' => current_indent_level += 1,
-                            '\n' => break, // empty line
-                            _ => { // indentation ends
-                                // really gross. this just checks if the previous token was a newline,
-                                // and that the token before it was punctuation or a known "operator",
-                                // and if so disregards indentation and treats it as a line continuation.
-                                if let Some(Newline) = res.get(res.len() - 1) {
-                                    if let Some(prev) = res.get(res.len() - 2) {
-                                        match prev { // all keywords and punctuation that may continue a line
-                                            // workaround for https://github.com/rust-lang/rust/issues/87121
-                                            Word(a) if a == "==" || a == "and" || a == "or" ||
-                                                       a == "xor" || a == "in" || a == "is" => {
-                                                res.pop();
-                                                break;
-                                            },
-                                            &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) |
-                                            Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => {
-                                                res.pop();
-                                                break;
-                                            }
-                                            _ => {}
-                                        }
+            ' ' => { // indentation! and whitespace
+                match res.last() {
+                    Some(Indent(_)) => { // indentation!
+                        res.pop(); // discard previous empty or useless Indent token
+                        let mut current_indent_level = 1;
+                        while let Some(x) = input.peek() {
+                            match x {
+                                ' ' => current_indent_level += 1,
+                                _ => match res.last() { // indentation ends
+                                    Some(Word(a)) if a == "==" || a == "and" || a == "or" ||
+                                                     a == "xor" || a == "in" || a == "is" => break,
+                                    Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) |
+                                    Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) |
+                                    Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break,
+                                    _ => {
+                                        res.push(Indent(current_indent_level));
+                                        break;
                                     }
                                 }
-
-                                // will only fire once. allows us to support X number of spaces so long as it's consistent
-                                if state.indent_width == 0 {
-                                    state.indent_width = current_indent_level;
-                                }
-
-                                if current_indent_level % state.indent_width != 0 {
-                                    return Err(InvalidIndentation.into());
-                                }
-
-                                let diff = (current_indent_level - state.indent_level) / state.indent_width;
-                                match diff {
-                                    0 => (),                // same level of indentation
-                                    1 => res.push(Begin),   // new level of indentation
-                                    -1 => res.push(End),    // old level of indentation
-                                    _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs
-                                }
-                                state.indent_level = current_indent_level;
-                                break;
                             }
                         }
+                    },
+                    _ => { // get rid of excess (all) whitespace between words/operators
+                        while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); }
                     }
-                } else { // get rid of excess (all) whitespace
-                    while input.peek() == Some(&' ') { input.next(); }
                 }
             },
             '\t' => return Err(InvalidIndentation.into()),
-            '\n' => {
-                state.start_of_line = true;
-                res.push(Newline)
-            },
+            '\n' => res.push(Indent(0)),
             '\'' => { // chars!
                 while let Some(x) = input.next() {
                     match x {
@@ -190,8 +156,8 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                         input.next(); input.next();
                         while let Some(x) = input.next() {
                             match x {
-                                '"' if input.peek_nth(1) == Some(&'"') &&
-                                       input.peek_nth(2) == Some(&'"') => {
+                                '"' if input.peek_nth(0) == Some(&'"') &&
+                                       input.peek_nth(1) == Some(&'"') => {
                                     input.next(); input.next();
                                     break;
                                },
@@ -257,13 +223,11 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                     }
                 }
             },
-            'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers!
-                buf.push(c); // todo: unicode support
+            c if c.is_alphabetic() || c == '_' => { // valid identifiers!
+                buf.push(c);
                 while let Some(x) = input.next() {
                     match x {
-                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
-                            buf.push(x);
-                        },
+                        x if x.is_alphanumeric() || x == '_' => buf.push(x),
                         _ => {
                             res.push(Word(String::from(&buf)));
                             match x { // () and [] denote both parameters/generics and tuples/arrays
@@ -285,7 +249,7 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
             '0'..='9' => { // numeric literals!
                 buf.push(c);
                 while let Some(x) = input.next() {
-                    match x { // todo: unicode support
+                    match x {
                         'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
                             buf.push(x);
                             input.next();
@@ -322,6 +286,11 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                     Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
                     None => return Err(MismatchedBrackets.into()),
                 }
+                if input.peek() == Some(&'(') { // a '(' directly after ']' opens the parameters following generics
+                    res.push(Sep(FuncLeftParen));
+                    state.paren_stack.push(Paren::Func);
+                    input.next(); // consume the '(' that was just recorded as FuncLeftParen
+                }
             },
             ',' => res.push(Sep(Comma)),
             '.' => res.push(Sep(Period)),
@@ -345,7 +314,7 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
             '?' => res.push(Sep(Question)),
             '^' => res.push(Sep(Caret)),
             '\\' => res.push(Sep(Backslash)),
-            _ => return Err("unknown character".into()) // todo: support unicode!
+            _ => return Err(UnknownPunctuation.into())
         }
         buf.clear();
     }
-- 
cgit v1.2.3-70-g09d2