use multipeek::multipeek; /// **Basic** syntax tokens. pub enum Token { Word(String), // identifiers. Lit(String), // literal value, ex. for strings/comments. Sep(char), // punctuation. non-word tokens. Ind(usize) // indentation. } /// Lexes a file into a Vec of fundamental Tokens. pub fn tokenize(input: &str) -> Vec { // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. // Tokens are matched by looping within their case until complete. // This then eliminates the need for almost all global parser state. use Token::*; let mut start_of_line = true; // state let mut buf = String::new(); // buffer let mut res = Vec::new(); // result // `char` in rust is four bytes it's fine let mut input = multipeek(input.chars()); while let Some(c) = input.next() { match c { ' ' => { if start_of_line { // indentation, to be dealt with later let mut indendation_level = 1; while let Some(x) = input.peek() { match x { ' ' => indendation_level += 1, '\n' => break, // empty line _ => { res.push(Ind(indendation_level)); break; } } } } else { // get rid of whitespace while input.peek() == Some(&' ') { input.next(); } } }, '\n' => { // newlines are separators start_of_line = true; res.push(Sep('\n')) }, c if c.is_whitespace() => (), // tabs etc are not supported '\'' => { // single quoted strings, i.e. chars res.push(Sep('\'')); while let Some(x) = input.next() { match x { '\'' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep('\'')); }, '"' => { // triple quoted strings if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') { input.next(); input.next(); res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"')); while let Some(x) = input.next() { match x { '"' if input.peek_nth(1) == Some(&'"') && input.peek_nth(2) == Some(&'"') => { break; }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); input.next(); input.next(); res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"')); } else { // regular strings res.push(Sep('"')); while let Some(x) = input.next() { match x { '"' => break, '\\' => if let Some(y) = input.next() { buf.push(y) }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep('"')); } }, '#' => { // block comment, can be nested if input.peek() == Some(&'[') { input.next(); res.push(Sep('#')); res.push(Sep('[')); let mut comment_level = 1; while let Some(x) = input.next() && comment_level > 0 { match x { '#' if input.peek() == Some(&'[') => { comment_level += 1; input.next(); }, ']' if input.peek() == Some(&'#') => { comment_level -= 1; input.next(); }, _ => buf.push(x) } } res.push(Lit(String::from(&buf))); res.push(Sep(']')); res.push(Sep('#')); } else { // standard comment, runs until eol res.push(Sep('#')); while let Some(x) = input.peek() { match x { '\n' => break, _ => { buf.push(*x); input.next(); } } } res.push(Lit(String::from(&buf))); } }, 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier while let Some(x) = input.peek() { match x { 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { buf.push(*x); input.next(); }, _ => break } } res.push(Word(String::from(&buf))); }, '.' | ',' | ':' | ';' | '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep _ => res.push(Sep(c)) // for now: treat unknown chars as Sep } buf.clear(); } return res; }