use multipeek::multipeek; pub type Result = core::result::Result>; pub type TokenStream = Vec; /// **Basic** syntax tokens. Form an unambiguous TokenStream. #[derive(Clone, PartialEq)] pub enum Token { Word(String), // identifiers. Lit(String), // literal value, ex. for strings/comments. Sep(char), // punctuation. non-word tokens. Begin, End // scope indicators. } /// All keywords that may continue a line. For knowing valid line splits. const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"]; /// Parses whitespace-sensitive code into an unambiguous TokenStream. /// Also useful for formatting. // todo: support indentation within expressions // nim: "As a rule of thumb, indentation within expressions is // allowed after operators, an open parenthesis and after commas." pub fn tokenize(input: &str) -> Result { // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. // Tokens are matched by looping within their case until complete. // This then eliminates the need for almost all global parser state. use Token::*; let mut start_of_line = true; // state let mut indent_level = 0; // state let mut indent_width = None; // state let mut buf = String::new(); // buffer let mut res = Vec::new(); // result // `char` in rust is four bytes it's fine let mut input = multipeek(input.chars()); while let Some(c) = input.next() { match c { ' ' => { if start_of_line { // indentation let mut current_indent_level = 1; while let Some(x) = input.peek() { match x { ' ' => current_indent_level += 1, '\n' => break, // empty line _ => { // indentation ends // really gross. this just checks if the previous token was a newline, // and that the token before it was punctuation or a known "operator", // and if so disregards indentation and treats it as a line continuation. if let Some(&Sep('\n')) = res.get(res.len() - 1) { if let Some(y) = res.get(res.len() - 2) { if let Word(z) = y { if valid_continuations.contains(&&z[..]) { res.pop(); break; } } else if let Sep(_) = y { res.pop(); break; } } } // will only fire once. allows us to support X number of spaces so long as it's consistent if indent_width.is_none() { indent_width = Some(current_indent_level); } let indent_width = indent_width.unwrap(); // safe. see above if current_indent_level % indent_width != 0 { return Err("indentation is offset".into()); } let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize; match diff { 0 => (), // same level of indentation 1 => res.push(Begin), // new level of indentation -1 => res.push(End), // old level of indentation _ => return Err("indentation stepped by too much in one go".into()) } indent_level = current_indent_level; break; } } } } else { // get rid of excess (all) whitespace while input.peek() == Some(&' ') { input.next(); } } }, '\n' => { // newlines are separators start_of_line = true; res.push(Sep('\n')) }, c if c.is_whitespace() => return Err("tabs etc are not supported".into()), '\'' => { // single quoted strings, i.e. chars res.push(Sep('\'')); while let Some(x) = input.next() { match x { '\'' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep('\'')); }, '"' => { // triple quoted strings if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') { input.next(); input.next(); res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"')); while let Some(x) = input.next() { match x { '"' if input.peek_nth(1) == Some(&'"') && input.peek_nth(2) == Some(&'"') => { break; }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); input.next(); input.next(); res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"')); } else { // regular strings res.push(Sep('"')); while let Some(x) = input.next() { match x { '"' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep('"')); } }, '#' => { // block comment, can be nested if input.peek() == Some(&'[') { input.next(); res.push(Sep('#')); res.push(Sep('[')); let mut comment_level = 1; while let Some(x) = input.next() && comment_level > 0 { match x { '#' if input.peek() == Some(&'[') => { comment_level += 1; input.next(); }, ']' if input.peek() == Some(&'#') => { comment_level -= 1; input.next(); }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep(']')); res.push(Sep('#')); } else { // standard comment, runs until eol res.push(Sep('#')); while let Some(x) = input.peek() { match x { '\n' => break, _ => { buf.push(*x); input.next(); } } } res.push(Lit(String::from(&buf))); } }, 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier while let Some(x) = input.peek() { match x { 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { buf.push(*x); input.next(); }, _ => break } } res.push(Word(String::from(&buf))); }, '.' | ',' | ':' | ';' | // punctuation '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), _ => res.push(Sep(c)) // for now: treat unknown chars as Sep } buf.clear(); } return Ok(res); }