// Lexer added to src/parser.rs by commit 33c62ea
// ("initial draft of a new, actually decent lexer").

/// Characters that may make up an operator token.
// NOTE(review): the head of this table was lost when the patch was extracted;
// only '>' onwards is attested. '=', '+', '-', '*', '/', '<' are a
// reconstruction -- confirm against the upstream file.
const OPERATORS: &[char] = &[
    '=', '+', '-', '*', '/', '<', '>', '@', '$', '~', '&', '%', '|', '!', '?', '^', '\\',
];
/// Bracket characters; each one is lexed as its own `Token::Separator`.
const BRACKETS: [char; 6] = ['(', ')', '{', '}', '[', ']'];
/// Punctuation that terminates the pending token and is lexed as a separator.
const SPECIAL: [char; 7] = ['.', ',', ':', ';', '`', '\'', '"'];
/// Reserved words, classified as `Token::Keyword` rather than identifiers.
const KEYWORDS: [&str; 3] = ["if", "else", "func"];

/// A single lexical token produced by [`lex`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Operator(String),
    Keyword(String),
    Separator(String),
    Identifier(String),
    Value(String),
    Char(char),
    String(String),
    Comment(String),
    Token(String), // catch-all
    // The three variants below are not emitted by `lex` yet.
    ScopeBegin, // {
    ScopeEnd,   // }
    ExprEnd,    // ;
}

/// Properly lexes a whitespace-oriented language into a series of tokens.
///
/// # Errors
///
/// Returns an error message when the input contains a tab, an invalid escape
/// sequence, a malformed character literal, an unterminated string literal,
/// or a word that [`parse_token`] cannot classify.
///
/// # Panics
///
/// Indentation and newline handling are still unimplemented (`todo!()`): a
/// newline outside a comment or string, a space with no pending token on a
/// non-blank line, or a plain token character that is the first non-space
/// character seen on a line all panic.
pub fn lex(input: &str) -> Result<Vec<Token>, &'static str> {
    enum State {
        Default,
        Char,
        String,
        MultiLineString,
        Comment,
    }
    struct Indentation {
        blank: bool,  // is the line entirely whitespace so far?
        level: usize, // current indentation level
        count: usize, // current whitespace count
    }

    let mut state = State::Default;
    let mut indent = Indentation { blank: true, level: 0, count: 0 };
    let mut buffer = String::new();
    let mut result = Vec::new();

    let mut chars = input.chars();
    while let Some(c) = chars.next() {
        match state {
            State::Default => match c {
                ' ' if indent.blank => indent.count += 1,
                ' ' if !buffer.is_empty() => flush_token(&mut buffer, &mut result)?,
                ' ' => todo!("space with no pending token"),
                '\n' => todo!("newline handling"),
                '\t' => return Err("Tabs are not supported!"),
                '\'' => {
                    // Only flush a non-empty buffer: an empty one used to be
                    // classified as an (empty) operator token.
                    flush_token(&mut buffer, &mut result)?;
                    // A char literal is either an escape (`'\n'`) or a single
                    // character followed directly by the closing quote.
                    if peek_nth(&chars, 0) == Some('\\') || peek_nth(&chars, 1) == Some('\'') {
                        state = State::Char;
                    } else {
                        result.push(Token::Separator("'".to_string()));
                    }
                }
                '"' => {
                    // Flush first so `abc"def"` does not leak `abc` into the
                    // string's contents.
                    flush_token(&mut buffer, &mut result)?;
                    if peek_nth(&chars, 0) == Some('"') && peek_nth(&chars, 1) == Some('"') {
                        state = State::MultiLineString;
                        chars.next();
                        chars.next();
                    } else {
                        state = State::String;
                    }
                }
                '#' => {
                    flush_token(&mut buffer, &mut result)?;
                    state = State::Comment;
                }
                _ if BRACKETS.contains(&c) || SPECIAL.contains(&c) => {
                    flush_token(&mut buffer, &mut result)?;
                    result.push(Token::Separator(c.to_string()));
                    indent.blank = false;
                }
                _ if indent.blank => {
                    indent.blank = false;
                    // Once the indentation check exists, this arm must go on
                    // to `buffer.push(c)`.
                    todo!("indentation check");
                }
                _ => buffer.push(c),
            },
            State::Char => {
                // Every arm below finishes the literal (or returns an error).
                state = State::Default;
                match c {
                    '\\' => {
                        result.push(Token::Char(unescape(chars.next())?));
                        expect_closing_quote(&mut chars)?;
                    }
                    // Reached for `'''`: the middle quote lands here.
                    // NOTE(review): yields a NUL char and leaves the third
                    // quote to the default state -- confirm this is intended.
                    '\'' => result.push(Token::Char('\0')),
                    _ => {
                        result.push(Token::Char(c));
                        expect_closing_quote(&mut chars)?;
                    }
                }
            }
            State::String => match c {
                '\\' => buffer.push(unescape(chars.next())?),
                '"' => {
                    state = State::Default;
                    result.push(Token::String(std::mem::take(&mut buffer)));
                }
                _ => buffer.push(c),
            },
            State::MultiLineString => match c {
                '"' if peek_nth(&chars, 0) == Some('"') && peek_nth(&chars, 1) == Some('"') => {
                    state = State::Default;
                    result.push(Token::String(std::mem::take(&mut buffer)));
                    chars.next();
                    chars.next();
                }
                _ => buffer.push(c),
            },
            State::Comment => match c {
                '\n' => {
                    state = State::Default;
                    // `take` also empties the buffer; previously the comment
                    // text stayed behind and leaked into the next token.
                    // NOTE(review): this newline does not reset `indent`;
                    // revisit together with the newline handling above.
                    result.push(Token::Comment(std::mem::take(&mut buffer)));
                }
                _ => buffer.push(c),
            },
        }
    }

    // End of input: emit whatever is pending instead of silently dropping it.
    match state {
        State::Default => flush_token(&mut buffer, &mut result)?,
        State::Comment => result.push(Token::Comment(buffer)),
        State::String | State::MultiLineString => return Err("Unterminated string literal!"),
        State::Char => return Err("Invalid character sequence!"),
    }
    Ok(result)
}

/// Returns the character `n` positions ahead of the cursor without consuming it.
fn peek_nth(chars: &std::str::Chars<'_>, n: usize) -> Option<char> {
    chars.clone().nth(n)
}

/// Classifies and emits the pending token, if any, then empties the buffer.
fn flush_token(buffer: &mut String, tokens: &mut Vec<Token>) -> Result<(), &'static str> {
    if !buffer.is_empty() {
        tokens.push(parse_token(buffer)?);
        buffer.clear();
    }
    Ok(())
}

/// Decodes the character following a backslash in a char or string literal.
fn unescape(escaped: Option<char>) -> Result<char, &'static str> {
    match escaped {
        Some('\\') => Ok('\\'),
        Some('0') => Ok('\0'),
        Some('n') => Ok('\n'),
        Some('r') => Ok('\r'),
        Some('t') => Ok('\t'),
        Some('"') => Ok('"'),
        Some('\'') => Ok('\''),
        _ => Err("Invalid string escape sequence!"),
    }
}

/// Consumes the next character and requires it to be the closing `'`.
fn expect_closing_quote(chars: &mut std::str::Chars<'_>) -> Result<(), &'static str> {
    if chars.next() == Some('\'') {
        Ok(())
    } else {
        Err("Invalid character sequence!")
    }
}

/// Classifies a bare word as a keyword, operator, value or identifier, tried
/// in that order.
///
/// # Errors
///
/// Returns `"Could not parse token!"` when the word fits none of the
/// categories; this includes the empty string.
fn parse_token(token: &str) -> Result<Token, &'static str> {
    if KEYWORDS.contains(&token) {
        Ok(Token::Keyword(token.to_string()))
    } else if is_operator(token) {
        Ok(Token::Operator(token.to_string()))
    } else if is_value(token) {
        Ok(Token::Value(token.to_string()))
    } else if is_identifier(token) {
        Ok(Token::Identifier(token.to_string()))
    } else {
        Err("Could not parse token!")
    }
}

/// True when `token` is non-empty and made up solely of operator characters.
fn is_operator(token: &str) -> bool {
    !token.is_empty() && token.chars().all(|c| OPERATORS.contains(&c))
}

/// True for the boolean literals and for non-empty all-numeric words.
fn is_value(token: &str) -> bool {
    if token == "true" || token == "false" {
        return true;
    }
    // fixme: hex literals etc
    // note size annotations are separately lexed
    !token.is_empty() && token.chars().all(char::is_numeric)
}

/// True when `token` starts with a letter and continues with alphanumerics or
/// underscores. A leading digit or underscore is rejected.
fn is_identifier(token: &str) -> bool {
    let mut chars = token.chars();
    match chars.next() {
        Some(first) if first.is_alphabetic() => chars.all(|c| c.is_alphanumeric() || c == '_'),
        _ => false,
    }
}