From 33c62ead792a996d483ee960bc056d1377fac682 Mon Sep 17 00:00:00 2001
From: JJ
Date: Thu, 13 Apr 2023 15:14:11 -0700
Subject: initial draft of a new, actually decent lexer

---
 src/parser.rs | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 214 insertions(+)

(limited to 'src')
diff --git a/src/parser.rs b/src/parser.rs
index 95807c1..d3895a4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,4 +1,5 @@
 use crate::ast::*;
+use multipeek::multipeek;
 
 /// Parses a lambda-calculus-like language into an AST.
 pub fn parse_lambda(input: &str) -> Result<Expression, peg::error::ParseError<peg::str::LineCol>> {
@@ -100,3 +101,216 @@ pub fn parse_lambda(input: &str) -> Result<Expression, peg::error::ParseError<pe
     return lambda::expr(input.trim());
 }
 
+const operators: [char; 17] = // alphabet of operator tokens; consulted by is_operator
+    ['=', '+', '-', '*', '/', '<', '>', '@', '$', '~', '&', '%', '|', '!', '?', '^', '\\'];
+const brackets: [char; 6] = ['(', ')', '{', '}', '[', ']']; // each is emitted by lex as a one-character Separator
+const special: [char; 7] = ['.', ',', ':', ';', '`', '\'', '"']; // likewise Separators; lex matches ' and " in earlier arms first
+const keywords: [&'static str; 3] = ["if", "else", "func"]; // words parse_token classifies as Token::Keyword
+
+pub enum Token { // NOTE(review): no derives yet; Debug/Clone/PartialEq would make tokens printable and comparable
+    Operator(String),   // word made up solely of `operators` characters (see is_operator)
+    Keyword(String),    // word listed in `keywords`
+    Separator(String),  // a single bracket/special character, or a `'` that does not open a char literal
+    Identifier(String), // word accepted by is_identifier
+    Value(String),      // `true`, `false`, or a word of numeric characters (see is_value)
+    Char(char),         // character literal with any escape already resolved
+    String(String),     // body of a "..." or """...""" literal, quotes removed
+    Comment(String),    // text collected after `#` until the newline
+    Token(String), // catch-all
+    ScopeBegin, // {
+    ScopeEnd,   // }
+    ExprEnd,    // ;
+}
+
+/// Lexes a whitespace-oriented language into a flat series of tokens.
+///
+/// Fails with a static message on tabs, invalid escapes, malformed or unterminated
+/// literals and words that `parse_token` rejects. Panics on the paths that are still
+/// `todo!()`: newlines, a space with no pending word, and the indentation check.
+pub fn lex(input: &str) -> Result<Vec<Token>, &'static str> {
+    enum State {
+        Default,
+        Char,
+        String,
+        MultiLineString,
+        Comment,
+    }
+    struct Indentation {
+        blank: bool,    // is the line entirely whitespace so far?
+        level: usize,   // current indentation level
+        count: usize,   // current whitespace count
+    }
+    // Classifies and emits the pending word, if any, leaving `buffer` empty.
+    fn flush(buffer: &mut String, result: &mut Vec<Token>) -> Result<(), &'static str> {
+        if !buffer.is_empty() {
+            result.push(parse_token(buffer.as_str())?);
+            buffer.clear();
+        }
+        Ok(())
+    }
+    // Resolves the character that follows a backslash in a char or string literal.
+    fn unescape(escaped: Option<char>) -> Result<char, &'static str> {
+        match escaped {
+            Some('\\') => Ok('\\'),
+            Some('0') => Ok('\0'),
+            Some('n') => Ok('\n'),
+            Some('r') => Ok('\r'),
+            Some('t') => Ok('\t'),
+            Some('\"') => Ok('\"'),
+            Some('\'') => Ok('\''),
+            _ => Err("Invalid string escape sequence!"),
+        }
+    }
+
+    let mut state = State::Default;
+    let mut indent = Indentation { blank: true, level: 0, count: 0 };
+    let mut buffer = String::new();
+    let mut result = Vec::new();
+
+    let mut input = multipeek(input.chars()); // multipeek my beloved
+    while let Some(c) = input.next() {
+        match state {
+            State::Default => match c {
+                ' ' if indent.blank => indent.count += 1,
+                ' ' if !buffer.is_empty() => flush(&mut buffer, &mut result)?,
+                ' ' => todo!(),
+                '\n' => todo!(),
+                '\t' => return Err("Tabs are not supported!"),
+                '\'' => {
+                    // Only a non-empty word is flushed; an empty one used to become Operator("").
+                    flush(&mut buffer, &mut result)?;
+                    // `'\…` or `'x'` opens a char literal; any other quote is a separator.
+                    if input.peek_nth(0) == Some(&'\\') || input.peek_nth(1) == Some(&'\'') {
+                        state = State::Char;
+                    } else {
+                        result.push(Token::Separator("'".to_string()));
+                    }
+                },
+                '"' => {
+                    // Emit the word before the quote so it cannot leak into the string body.
+                    flush(&mut buffer, &mut result)?;
+                    if input.peek_nth(0) == Some(&'\"') && input.peek_nth(1) == Some(&'\"') {
+                        state = State::MultiLineString;
+                        input.next();
+                        input.next();
+                    } else {
+                        state = State::String;
+                    }
+                },
+                '#' => {
+                    state = State::Comment;
+                    flush(&mut buffer, &mut result)?;
+                },
+                _ if brackets.contains(&c) || special.contains(&c) => {
+                    flush(&mut buffer, &mut result)?;
+                    result.push(Token::Separator(c.to_string()));
+                    indent.blank = false;
+                }
+                _ if indent.blank => {
+                    indent.blank = false;
+                    // indentation check
+                    todo!();
+                    buffer.push(c); // unreachable until the check above is implemented
+                }
+                _ => buffer.push(c)
+            },
+            State::Char => {
+                state = State::Default;
+                if c == '\'' {
+                    // A quote right after the opening quote is lexed as NUL, as before.
+                    result.push(Token::Char('\0'));
+                } else {
+                    let literal = if c == '\\' { unescape(input.next())? } else { c };
+                    result.push(Token::Char(literal));
+                    if input.next() != Some('\'') {
+                        return Err("Invalid character sequence!");
+                    }
+                }
+            },
+            State::String => match c {
+                '\\' => buffer.push(unescape(input.next())?),
+                '\"' => {
+                    state = State::Default;
+                    result.push(Token::String(std::mem::take(&mut buffer)));
+                }
+                _ => buffer.push(c)
+            },
+            State::MultiLineString => match c {
+                '\"' if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') => {
+                    state = State::Default;
+                    result.push(Token::String(std::mem::take(&mut buffer)));
+                    input.next();
+                    input.next();
+                },
+                _ => buffer.push(c)
+            },
+            State::Comment => match c {
+                '\n' => {
+                    state = State::Default;
+                    // Take the text so it is not prepended to the next token.
+                    result.push(Token::Comment(std::mem::take(&mut buffer)));
+                },
+                _ => buffer.push(c)
+            },
+        }
+    }
+    // End of input: emit or reject whatever is pending instead of silently dropping it.
+    match state {
+        State::Default => flush(&mut buffer, &mut result)?,
+        State::Comment => result.push(Token::Comment(buffer)),
+        State::Char => return Err("Unterminated character literal!"),
+        State::String | State::MultiLineString => return Err("Unterminated string literal!"),
+    }
+    Ok(result)
+}
+
+/// Classifies one word as a keyword, operator, value or identifier token.
+///
+/// The checks run in exactly that order; a word that passes none of them is an error.
+fn parse_token(token: &str) -> Result<Token, &'static str> {
+    let wrap: fn(String) -> Token = match token {
+        word if keywords.contains(&word) => Token::Keyword,
+        word if is_operator(word) => Token::Operator,
+        word if is_value(word) => Token::Value,
+        word if is_identifier(word) => Token::Identifier,
+        _ => return Err("Could not parse token!"),
+    };
+    Ok(wrap(token.to_string()))
+}
+
+/// Reports whether every character of `token` is listed in `operators`.
+///
+/// An empty `token` contains no offending character and therefore also yields `true`.
+fn is_operator(token: &str) -> bool {
+    // Look for the first character outside the operator alphabet.
+    let offender = token.chars().find(|ch| !operators.contains(ch));
+    offender.is_none()
+}
+
+/// Reports whether `token` is a literal value: a boolean or a run of numeric characters.
+///
+/// `true` and `false` are accepted verbatim. Anything else must consist solely of
+/// characters for which `char::is_numeric` holds, so an empty token is accepted too,
+/// and so are non-ASCII numerics such as `٣`.
+fn is_value(token: &str) -> bool {
+    // fixme: hex literals etc
+    // note size annotations are separately lexed
+    match token {
+        "true" | "false" => true,
+        digits => digits.chars().all(char::is_numeric),
+    }
+}
+
+/// Reports whether `token` is a well-formed identifier.
+///
+/// Every character must be alphanumeric or `_`, and the first one may be neither a
+/// numeric character nor `_`. An empty token has nothing that violates either rule
+/// and is therefore accepted as well.
+fn is_identifier(token: &str) -> bool {
+    let bad_start = token.starts_with(|ch: char| ch.is_numeric() || ch == '_');
+    if bad_start {
+        return false;
+    }
+    // The first character is checked again here, which is harmless.
+    token.chars().all(|ch| ch.is_alphanumeric() || ch == '_')
+}
-- 
cgit v1.2.3-70-g09d2