From 7473fb09ed89e5e2fde8a7d6e89bc5d28151bdae Mon Sep 17 00:00:00 2001
From: JJ
Date: Thu, 26 Oct 2023 01:57:33 -0700
Subject: compiler: redesign lexer. still needs unicode support & expr indentation

---
 src/lex.rs  | 361 +++++++++++++++++++++++++++++++++++++++++++-----------------
 src/main.rs |   4 +-
 2 files changed, 263 insertions(+), 102 deletions(-)

diff --git a/src/lex.rs b/src/lex.rs
index 8999aea..d3d5a7e 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -3,41 +3,119 @@
 use multipeek::multipeek;
 
 pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
 pub type TokenStream = Vec<Token>;
 
+#[derive(Clone, PartialEq, Debug)]
+pub enum LexicalError {
+    InvalidIndentation,
+    MismatchedParens,
+    MismatchedBrackets,
+}
+
+impl std::fmt::Display for LexicalError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+impl std::error::Error for LexicalError {}
+
 /// **Basic** syntax tokens. Form an unambiguous TokenStream.
 #[derive(Clone, PartialEq)]
 pub enum Token {
     Word(String), // identifiers.
-    Lit(String), // literal value, ex. for strings/comments.
-    Sep(char), // punctuation. non-word tokens.
-    Begin, End // scope indicators.
+    Num(String), // numeric value, ex. 413, 0b101011, 0xabcd
+    Lit(Literal), // literal value, ex. for strings/comments.
+    Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
+    Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead?
+}
+
+#[derive(Clone, PartialEq)]
+pub enum Literal {
+    Char(String),
+    SingleLineString(String),
+    MultiLineString(String),
+    Comment(String),
+    DocComment(String),
 }
 
-/// All keywords that may continue a line. For knowing valid line splits.
-const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];
+/// All punctuation recognized by the lexer.
+/// Note the distinction between FuncLeftParen and TupleLeftParen.
+#[derive(Clone, PartialEq)]
+pub enum Punctuation {
+    Comma,               // ,
+    Period,              // .
+    Semicolon,           // ;
+    Colon,               // :
+    BackTick,            // `
+    SingleQuote,         // '
+    DoubleQuote,         // "
+    FuncLeftParen,       // (
+    FuncRightParen,      // )
+    TupleLeftParen,      // (
+    TupleRightParen,     // )
+    GenericLeftBracket,  // [
+    GenericRightBracket, // ]
+    ArrayLeftBracket,    // [
+    ArrayRightBracket,   // ]
+    StructLeftBrace,     // {
+    StructRightBrace,    // }
+    Equals,              // =
+    Plus,                // +
+    Minus,               // distinction between minus and negative.
+    Negative,            // negative binds tightly: there is no whitespace following.
+    Times,               // *
+    Slash,               // /
+    LessThan,            // <
+    GreaterThan,         // >
+    At,                  // @
+    Sha,                 // $
+    Tilde,               // ~
+    And,                 // &
+    Percent,             // %
+    Or,                  // |
+    Exclamation,         // !
+    Question,            // ?
+    Caret,               // ^
+    Backslash,           // \
+}
 
 /// Parses whitespace-sensitive code into an unambiguous TokenStream.
 /// Also useful for formatting.
-// todo: support indentation within expressions
-// nim: "As a rule of thumb, indentation within expressions is
-// allowed after operators, an open parenthesis and after commas."
+// todo: rewrite indentation parsing to do what nim does, annotate tokens with indentation preceding
 pub fn tokenize(input: &str) -> Result<TokenStream> {
     // The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
     // Tokens are matched by looping within their case until complete.
-    // This then eliminates the need for almost all global parser state.
+    // This then eliminates the need for most global parser state. (i hate state)
     use Token::*;
-    let mut start_of_line = true; // state
-    let mut indent_level = 0; // state
-    let mut indent_width = None; // state
-    let mut buf = String::new(); // buffer
-    let mut res = Vec::new(); // result
+    use Literal::*;
+    use Punctuation::*;
+    use LexicalError::*;
+    enum Paren { Func, Tuple }
+    enum Bracket { Generic, Array }
+    struct State {
+        start_of_line: bool,
+        indent_level: isize,
+        indent_width: isize,
+        paren_stack: Vec<Paren>,
+        bracket_stack: Vec<Bracket>,
+    }
+
+    let mut state = State {
+        start_of_line: true,
+        indent_level: 0,
+        indent_width: 0,
+        paren_stack: vec!(),
+        bracket_stack: vec!(),
+    };
+
+    let mut buf = String::new();
+    let mut res = Vec::new();
     // `char` in rust is four bytes it's fine
     let mut input = multipeek(input.chars());
     while let Some(c) = input.next() {
         match c {
             ' ' => {
-                if start_of_line { // indentation
+                if state.start_of_line { // indentation
                     let mut current_indent_level = 1;
                     while let Some(x) = input.peek() {
                         match x {
@@ -47,38 +125,42 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                                 // really gross. this just checks if the previous token was a newline,
                                 // and that the token before it was punctuation or a known "operator",
                                 // and if so disregards indentation and treats it as a line continuation.
-                                if let Some(&Sep('\n')) = res.get(res.len() - 1) {
-                                    if let Some(y) = res.get(res.len() - 2) {
-                                        if let Word(z) = y {
-                                            if valid_continuations.contains(&&z[..]) {
+                                if let Some(Newline) = res.get(res.len() - 1) {
+                                    if let Some(prev) = res.get(res.len() - 2) {
+                                        match prev { // all keywords and punctuation that may continue a line
+                                            // workaround for https://github.com/rust-lang/rust/issues/87121
+                                            Word(a) if a == "==" || a == "and" || a == "or" ||
+                                                       a == "xor" || a == "in" || a == "is" => {
+                                                res.pop();
+                                                break;
+                                            },
+                                            &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) |
+                                            Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => {
                                                 res.pop();
                                                 break;
                                             }
-                                        } else if let Sep(_) = y {
-                                            res.pop();
-                                            break;
+                                            _ => {}
                                         }
                                     }
                                 }
                                 // will only fire once. allows us to support X number of spaces so long as it's consistent
-                                if indent_width.is_none() {
-                                    indent_width = Some(current_indent_level);
+                                if state.indent_width == 0 {
+                                    state.indent_width = current_indent_level;
                                 }
-                                let indent_width = indent_width.unwrap(); // safe. see above
-                                if current_indent_level % indent_width != 0 {
-                                    return Err("indentation is offset".into());
+                                if current_indent_level % state.indent_width != 0 {
+                                    return Err(InvalidIndentation.into());
                                 }
-                                let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize;
+                                let diff = (current_indent_level - state.indent_level) / state.indent_width;
                                 match diff {
                                     0 => (), // same level of indentation
                                     1 => res.push(Begin), // new level of indentation
                                     -1 => res.push(End), // old level of indentation
-                                    _ => return Err("indentation stepped by too much in one go".into())
+                                    _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs
                                 }
-                                indent_level = current_indent_level;
+                                state.indent_level = current_indent_level;
                                 break;
                             }
                         }
@@ -87,13 +169,12 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                     while input.peek() == Some(&' ') { input.next(); }
                 }
             },
-            '\n' => { // newlines are separators
-                start_of_line = true;
-                res.push(Sep('\n'))
+            '\t' => return Err(InvalidIndentation.into()),
+            '\n' => {
+                state.start_of_line = true;
+                res.push(Newline)
             },
-            c if c.is_whitespace() => return Err("tabs etc are not supported".into()),
-            '\'' => { // single quoted strings, i.e. chars
-                res.push(Sep('\''));
+            '\'' => { // chars!
                 while let Some(x) = input.next() {
                     match x {
                         '\'' => break,
@@ -101,88 +182,170 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
                         _ => buf.push(x)
                     }
                 }
-                res.push(Lit(String::from(&buf)));
-                res.push(Sep('\''));
+                res.push(Lit(Char(String::from(&buf))));
             },
-            '"' => { // triple quoted strings
-                if input.peek_nth(0) == Some(&'"') &&
-                   input.peek_nth(1) == Some(&'"') {
-                    input.next(); input.next();
-                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
-                    while let Some(x) = input.next() {
-                        match x {
-                            '"' if input.peek_nth(1) == Some(&'"') &&
-                                   input.peek_nth(2) == Some(&'"') => {
-                                break;
-                            },
-                            _ => buf.push(x)
+            '"' => { // strings!
+                match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) {
+                    (Some('"'), Some('"')) => { // triple quoted strings
+                        input.next(); input.next();
+                        while let Some(x) = input.next() {
+                            match x {
+                                '"' if input.peek_nth(1) == Some(&'"') &&
+                                       input.peek_nth(2) == Some(&'"') => {
+                                    input.next(); input.next();
+                                    break;
+                                },
+                                _ => buf.push(x)
+                            }
                         }
-                    res.push(Lit(String::from(&buf)));
-                    input.next(); input.next();
-                    res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
-                } else { // regular strings
-                    res.push(Sep('"'));
-                    while let Some(x) = input.next() {
-                        match x {
-                            '"' => break,
-                            '\\' => if let Some(y) = input.next() { buf.push(y) },
-                            _ => buf.push(x)
+                        res.push(Lit(MultiLineString(String::from(&buf))));
+                    },
+                    (_, _) => { // single quoted strings
+                        while let Some(x) = input.next() {
+                            match x {
+                                '"' => break,
+                                '\\' => if let Some(y) = input.next() { buf.push(y) },
+                                _ => buf.push(x)
+                            }
                         }
+                        res.push(Lit(SingleLineString(String::from(&buf))));
                     }
-                    res.push(Lit(String::from(&buf)));
-                    res.push(Sep('"'));
                 }
             },
-            '#' => { // block comment, can be nested
-                if input.peek() == Some(&'[') {
-                    input.next();
-                    res.push(Sep('#')); res.push(Sep('['));
-                    let mut comment_level = 1;
-                    while let Some(x) = input.next() && comment_level > 0 {
-                        match x {
-                            '#' if input.peek() == Some(&'[') => {
-                                comment_level += 1;
-                                input.next();
-                            },
-                            ']' if input.peek() == Some(&'#') => {
-                                comment_level -= 1;
-                                input.next();
-                            },
-                            _ => buf.push(x)
+            '#' => { // comments!
+                match input.peek() {
+                    Some('[') => { // block comment, can be nested
+                        input.next();
+                        let mut comment_level = 1;
+                        while let Some(x) = input.next() && comment_level > 0 {
+                            match x {
+                                '#' if input.peek() == Some(&'[') => {
+                                    comment_level += 1;
+                                    input.next();
+                                },
+                                ']' if input.peek() == Some(&'#') => {
+                                    comment_level -= 1;
+                                    input.next();
+                                },
+                                _ => buf.push(x)
+                            }
                         }
-                    }
-                    res.push(Lit(String::from(&buf)));
-                    res.push(Sep(']')); res.push(Sep('#'));
-                } else { // standard comment, runs until eol
-                    res.push(Sep('#'));
-                    while let Some(x) = input.peek() {
-                        match x {
-                            '\n' => break,
-                            _ => {
-                                buf.push(*x);
-                                input.next();
+                        res.push(Lit(Comment(String::from(&buf))));
+                    },
+                    Some(&'#') => { // documentation comment
+                        input.next();
+                        while let Some(x) = input.next() {
+                            match x {
+                                '\n' => break,
+                                _ => {
+                                    buf.push(x);
+                                }
+                            }
+                        }
+                        res.push(Lit(DocComment(String::from(&buf))));
+                    },
+                    _ => { // standard comment, runs til EOL
+                        while let Some(x) = input.next() {
+                            match x {
+                                '\n' => break,
+                                _ => {
+                                    buf.push(x);
+                                }
                             }
                         }
+                        res.push(Lit(Comment(String::from(&buf))));
                     }
-                    res.push(Lit(String::from(&buf)));
                 }
             },
-            'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier
-                while let Some(x) = input.peek() {
+            'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers!
+                buf.push(c); // todo: unicode support
+                while let Some(x) = input.next() {
                     match x {
-                        'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
-                            buf.push(*x);
+                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+                            buf.push(x);
+                        },
+                        _ => {
+                            res.push(Word(String::from(&buf)));
+                            match x { // () and [] denote both parameters/generics and tuples/arrays
+                                '(' => { // we must disambiguate by treating those *directly* after words as such
+                                    res.push(Sep(FuncLeftParen));
+                                    state.paren_stack.push(Paren::Func);
+                                },
+                                '[' => {
+                                    res.push(Sep(GenericLeftBracket));
+                                    state.bracket_stack.push(Bracket::Generic);
+                                },
+                                _ => {},
+                            }
+                            break;
+                        }
+                    }
+                }
+            },
+            '0'..='9' => { // numeric literals!
+                buf.push(c);
+                while let Some(x) = input.next() {
+                    match x { // todo: unicode support
+                        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+                            buf.push(x);
                             input.next();
                         },
                         _ => break
                     }
                 }
-                res.push(Word(String::from(&buf)));
+                res.push(Num(String::from(&buf)))
+            },
+            '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator.
+                match input.peek() {
+                    Some(' ') => res.push(Sep(Minus)),
+                    _ => res.push(Sep(Negative))
+                }
+            },
+            '(' => { // note: FuncParens were matched above, directly after identifiers
+                res.push(Sep(TupleLeftParen));
+                state.paren_stack.push(Paren::Tuple);
+            },
+            '[' => { // note: GenericBrackets were matched above, directly after identifiers
+                res.push(Sep(ArrayLeftBracket));
+                state.bracket_stack.push(Bracket::Array);
+            },
+            ')' => {
+                match state.paren_stack.pop() {
+                    Some(Paren::Func) => res.push(Sep(FuncRightParen)),
+                    Some(Paren::Tuple) => res.push(Sep(TupleRightParen)),
+                    None => return Err(MismatchedParens.into()),
+                }
+            },
+            ']' => {
+                match state.bracket_stack.pop() {
+                    Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)),
+                    Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
+                    None => return Err(MismatchedBrackets.into()),
+                }
             },
-            '.' | ',' | ':' | ';' | // punctuation
-            '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)),
-            _ => res.push(Sep(c)) // for now: treat unknown chars as Sep
+            ',' => res.push(Sep(Comma)),
+            '.' => res.push(Sep(Period)),
+            ';' => res.push(Sep(Semicolon)),
+            ':' => res.push(Sep(Colon)),
+            '`' => res.push(Sep(BackTick)),
+            '{' => res.push(Sep(StructLeftBrace)),
+            '}' => res.push(Sep(StructRightBrace)),
+            '=' => res.push(Sep(Equals)),
+            '+' => res.push(Sep(Plus)),
+            '*' => res.push(Sep(Times)),
+            '/' => res.push(Sep(Slash)),
+            '<' => res.push(Sep(LessThan)),
+            '>' => res.push(Sep(GreaterThan)),
+            '@' => res.push(Sep(At)),
+            '$' => res.push(Sep(Sha)),
+            '~' => res.push(Sep(Tilde)),
+            '&' => res.push(Sep(And)),
+            '|' => res.push(Sep(Or)),
+            '!' => res.push(Sep(Exclamation)),
+            '?' => res.push(Sep(Question)),
+            '^' => res.push(Sep(Caret)),
+            '\\' => res.push(Sep(Backslash)),
+            _ => return Err("unknown character".into()) // todo: support unicode!
         }
         buf.clear();
     }
diff --git a/src/main.rs b/src/main.rs
index c366367..5cbdf00 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,8 +2,6 @@
 #![feature(exclusive_range_pattern, let_chains)]
 
 mod lex;
-// mod parse;
-// mod check;
 mod tree;
 
-fn main() { }
+fn main() {}
--
cgit v1.2.3-70-g09d2
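A quick usage sketch, illustrative only and not part of the patch: assuming a crate that declares `mod lex;` as src/main.rs does above, the redesigned API could be driven along these lines. The sample source text, its indentation style, and the scope-counting logic are invented for demonstration; only `tokenize`, `Token`, and the boxed error type come from the patch itself.

    use crate::lex::{tokenize, Token};

    fn main() {
        // hypothetical source text for the language being lexed; not taken from the patch or its tests
        let source = "func max(x, y)\n  if x > y\n    x\n  else\n    y\n";
        match tokenize(source) {
            Ok(tokens) => {
                // Begin/End tokens bracket indentation scopes; Newline marks line breaks
                let scopes = tokens.iter().filter(|t| matches!(t, Token::Begin)).count();
                println!("lexed {} tokens, {} indented scopes", tokens.len(), scopes);
            }
            // LexicalError values are boxed into the Result alias and printed via their Display impl
            Err(e) => eprintln!("lex error: {}", e),
        }
    }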