diff options
Diffstat (limited to 'src/lex.rs')
-rw-r--r-- | src/lex.rs | 75 |
1 file changed, 61 insertions, 14 deletions
@@ -1,21 +1,34 @@ use multipeek::multipeek; -/// **Basic** syntax tokens. +pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>; +pub type TokenStream = Vec<Token>; + +/// **Basic** syntax tokens. Form an unambiguous TokenStream. +#[derive(Clone, PartialEq)] pub enum Token { Word(String), // identifiers. Lit(String), // literal value, ex. for strings/comments. Sep(char), // punctuation. non-word tokens. - Ind(usize) // indentation. + Begin, End // scope indicators. } -/// Lexes a file into a Vec of fundamental Tokens. -pub fn tokenize(input: &str) -> Vec<Token> { +/// All keywords that may continue a line. For knowing valid line splits. +const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"]; + +/// Parses whitespace-sensitive code into an unambiguous TokenStream. +/// Also useful for formatting. +// todo: support indentation within expressions +// nim: "As a rule of thumb, indentation within expressions is +// allowed after operators, an open parenthesis and after commas." +pub fn tokenize(input: &str) -> Result<TokenStream> { // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. // Tokens are matched by looping within their case until complete. // This then eliminates the need for almost all global parser state. 
use Token::*; let mut start_of_line = true; // state + let mut indent_level = 0; // state + let mut indent_width = None; // state let mut buf = String::new(); // buffer let mut res = Vec::new(); // result @@ -24,19 +37,53 @@ pub fn tokenize(input: &str) -> Vec<Token> { while let Some(c) = input.next() { match c { ' ' => { - if start_of_line { // indentation, to be dealt with later - let mut indendation_level = 1; + if start_of_line { // indentation + let mut current_indent_level = 1; while let Some(x) = input.peek() { match x { - ' ' => indendation_level += 1, + ' ' => current_indent_level += 1, '\n' => break, // empty line - _ => { - res.push(Ind(indendation_level)); + _ => { // indentation ends + // really gross. this just checks if the previous token was a newline, + // and that the token before it was punctuation or a known "operator", + // and if so disregards indentation and treats it as a line continuation. + if let Some(&Sep('\n')) = res.get(res.len() - 1) { + if let Some(y) = res.get(res.len() - 2) { + if let Word(z) = y { + if valid_continuations.contains(&&z[..]) { + res.pop(); + break; + } + } else if let Sep(_) = y { + res.pop(); + break; + } + } + } + + // will only fire once. allows us to support X number of spaces so long as it's consistent + if indent_width.is_none() { + indent_width = Some(current_indent_level); + } + + let indent_width = indent_width.unwrap(); // safe. 
see above + if current_indent_level % indent_width != 0 { + return Err("indentation is offset".into()); + } + + let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize; + match diff { + 0 => (), // same level of indentation + 1 => res.push(Begin), // new level of indentation + -1 => res.push(End), // old level of indentation + _ => return Err("indentation stepped by too much in one go".into()) + } + indent_level = current_indent_level; break; } } } - } else { // get rid of whitespace + } else { // get rid of excess (all) whitespace while input.peek() == Some(&' ') { input.next(); } } }, @@ -44,7 +91,7 @@ pub fn tokenize(input: &str) -> Vec<Token> { start_of_line = true; res.push(Sep('\n')) }, - c if c.is_whitespace() => (), // tabs etc are not supported + c if c.is_whitespace() => return Err("tabs etc are not supported".into()), '\'' => { // single quoted strings, i.e. chars res.push(Sep('\'')); while let Some(x) = input.next() { @@ -133,11 +180,11 @@ pub fn tokenize(input: &str) -> Vec<Token> { } res.push(Word(String::from(&buf))); }, - '.' | ',' | ':' | ';' | - '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep + '.' | ',' | ':' | ';' | // punctuation + '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), _ => res.push(Sep(c)) // for now: treat unknown chars as Sep } buf.clear(); } - return res; + return Ok(res); } |