From 3a25fa06d2cd9d2b89b68622415d7168cf06c0da Mon Sep 17 00:00:00 2001 From: JJ Date: Thu, 26 Oct 2023 03:34:03 -0700 Subject: compiler: unicode support, reconsider indentation handling --- src/lex.rs | 101 +++++++++++++++++++++---------------------------------------- 1 file changed, 35 insertions(+), 66 deletions(-) diff --git a/src/lex.rs b/src/lex.rs index d3d5a7e..7e31476 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -8,6 +8,7 @@ pub enum LexicalError { InvalidIndentation, MismatchedParens, MismatchedBrackets, + UnknownPunctuation, } impl std::fmt::Display for LexicalError { @@ -24,7 +25,7 @@ pub enum Token { Num(String), // numeric value, ex. 413, 0b101011, 0xabcd Lit(Literal), // literal value, ex. for strings/comments. Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words. - Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead? + Indent(usize), // indentation. denotes line breaks and scope at which a line starts. } #[derive(Clone, PartialEq)] @@ -93,16 +94,12 @@ pub fn tokenize(input: &str) -> Result { enum Bracket { Generic, Array } struct State { start_of_line: bool, - indent_level: isize, - indent_width: isize, paren_stack: Vec, bracket_stack: Vec, } let mut state = State { start_of_line: true, - indent_level: 0, - indent_width: 0, paren_stack: vec!(), bracket_stack: vec!(), }; @@ -114,66 +111,35 @@ pub fn tokenize(input: &str) -> Result { let mut input = multipeek(input.chars()); while let Some(c) = input.next() { match c { - ' ' => { - if state.start_of_line { // indentation - let mut current_indent_level = 1; - while let Some(x) = input.peek() { - match x { - ' ' => current_indent_level += 1, - '\n' => break, // empty line - _ => { // indentation ends - // really gross. this just checks if the previous token was a newline, - // and that the token before it was punctuation or a known "operator", - // and if so disregards indentation and treats it as a line continuation. - if let Some(Newline) = res.get(res.len() - 1) { - if let Some(prev) = res.get(res.len() - 2) { - match prev { // all keywords and punctuation that may continue a line - // workaround for https://github.com/rust-lang/rust/issues/87121 - Word(a) if a == "==" || a == "and" || a == "or" || - a == "xor" || a == "in" || a == "is" => { - res.pop(); - break; - }, - &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) | - Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => { - res.pop(); - break; - } - _ => {} - } + ' ' => { // indentation! and whitespace + match res.last() { + Some(Indent(_)) => { // indentation! + res.pop(); // discard previous empty or useless Indent token + let mut current_indent_level = 1; + while let Some(x) = input.peek() { + match x { + ' ' => current_indent_level += 1, + _ => match res.last() { // indentation ends + Some(Word(a)) if a == "==" || a == "and" || a == "or" || + a == "xor" || a == "in" || a == "is" => break, + Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) | + Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) | + Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break, + _ => { + res.push(Indent(current_indent_level)); + break; } } - - // will only fire once. allows us to support X number of spaces so long as it's consistent - if state.indent_width == 0 { - state.indent_width = current_indent_level; - } - - if current_indent_level % state.indent_width != 0 { - return Err(InvalidIndentation.into()); - } - - let diff = (current_indent_level - state.indent_level) / state.indent_width; - match diff { - 0 => (), // same level of indentation - 1 => res.push(Begin), // new level of indentation - -1 => res.push(End), // old level of indentation - _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs - } - state.indent_level = current_indent_level; - break; } } + }, + _ => { // get rid of excess (all) whitespace between words/operators + while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); } } - } else { // get rid of excess (all) whitespace - while input.peek() == Some(&' ') { input.next(); } } }, '\t' => return Err(InvalidIndentation.into()), - '\n' => { - state.start_of_line = true; - res.push(Newline) - }, + '\n' => res.push(Indent(0)), '\'' => { // chars! while let Some(x) = input.next() { match x { @@ -190,8 +156,8 @@ pub fn tokenize(input: &str) -> Result { input.next(); input.next(); while let Some(x) = input.next() { match x { - '"' if input.peek_nth(1) == Some(&'"') && - input.peek_nth(2) == Some(&'"') => { + '"' if input.peek_nth(0) == Some(&'"') && + input.peek_nth(1) == Some(&'"') => { input.next(); input.next(); break; }, @@ -257,13 +223,11 @@ pub fn tokenize(input: &str) -> Result { } } }, - 'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers! - buf.push(c); // todo: unicode support + c if c.is_alphabetic() || c == '_' => { // valid identifiers! + buf.push(c); while let Some(x) = input.next() { match x { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { - buf.push(x); - }, + x if x.is_alphanumeric() || x == '_' => buf.push(x), _ => { res.push(Word(String::from(&buf))); match x { // () and [] denote both parameters/generics and tuples/arrays @@ -285,7 +249,7 @@ pub fn tokenize(input: &str) -> Result { '0'..='9' => { // numeric literals! buf.push(c); while let Some(x) = input.next() { - match x { // todo: unicode support + match x { 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { buf.push(x); input.next(); @@ -322,6 +286,11 @@ pub fn tokenize(input: &str) -> Result { Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)), None => return Err(MismatchedBrackets.into()), } + if input.peek() == Some(&'[') { // parameters following generics + res.push(Sep(FuncLeftParen)); + state.paren_stack.push(Paren::Func); + input.next(); + } }, ',' => res.push(Sep(Comma)), '.' => res.push(Sep(Period)), @@ -345,7 +314,7 @@ pub fn tokenize(input: &str) -> Result { '?' => res.push(Sep(Question)), '^' => res.push(Sep(Caret)), '\\' => res.push(Sep(Backslash)), - _ => return Err("unknown character".into()) // todo: support unicode! + _ => return Err(UnknownPunctuation.into()) } buf.clear(); } -- cgit v1.2.3-70-g09d2