aboutsummaryrefslogtreecommitdiff
path: root/src/lex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lex.rs')
-rw-r--r--src/lex.rs75
1 files changed, 61 insertions, 14 deletions
diff --git a/src/lex.rs b/src/lex.rs
index bfd74ae..8999aea 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -1,21 +1,34 @@
use multipeek::multipeek;
-/// **Basic** syntax tokens.
+pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
+pub type TokenStream = Vec<Token>;
+
+/// **Basic** syntax tokens. Form an unambiguous TokenStream.
+#[derive(Clone, PartialEq)]
pub enum Token {
Word(String), // identifiers.
Lit(String), // literal value, ex. for strings/comments.
Sep(char), // punctuation. non-word tokens.
- Ind(usize) // indentation.
+ Begin, End // scope indicators.
}
-/// Lexes a file into a Vec of fundamental Tokens.
-pub fn tokenize(input: &str) -> Vec<Token> {
+/// Keywords that may continue a line: when one of these ends a line, the next line is treated as a continuation of it.
+const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];
+
+/// Parses whitespace-sensitive code into an unambiguous TokenStream.
+/// Also useful for formatting.
+// todo: support indentation within expressions
+// nim: "As a rule of thumb, indentation within expressions is
+// allowed after operators, an open parenthesis and after commas."
+pub fn tokenize(input: &str) -> Result<TokenStream> {
// The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
// Tokens are matched by looping within their case until complete.
// This then eliminates the need for almost all global parser state.
use Token::*;
let mut start_of_line = true; // state
+ let mut indent_level = 0; // state
+ let mut indent_width = None; // state
let mut buf = String::new(); // buffer
let mut res = Vec::new(); // result
@@ -24,19 +37,53 @@ pub fn tokenize(input: &str) -> Vec<Token> {
while let Some(c) = input.next() {
match c {
' ' => {
- if start_of_line { // indentation, to be dealt with later
- let mut indendation_level = 1;
+ if start_of_line { // indentation
+ let mut current_indent_level = 1;
while let Some(x) = input.peek() {
match x {
- ' ' => indendation_level += 1,
+ ' ' => current_indent_level += 1,
'\n' => break, // empty line
- _ => {
- res.push(Ind(indendation_level));
+ _ => { // indentation ends
+ // Hack: line-continuation detection. If the previous token was a newline and the
+ // token before it was a `Sep` or one of the `valid_continuations` keywords, drop that
+ // newline and skip the indentation handling, treating this line as a continuation.
+ if let Some(&Sep('\n')) = res.get(res.len() - 1) {
+ if let Some(y) = res.get(res.len() - 2) {
+ if let Word(z) = y {
+ if valid_continuations.contains(&&z[..]) {
+ res.pop();
+ break;
+ }
+ } else if let Sep(_) = y {
+ res.pop();
+ break;
+ }
+ }
+ }
+
+ // Fires only once: the first indented line fixes the indent width, so any number of spaces per level is supported as long as it is used consistently.
+ if indent_width.is_none() {
+ indent_width = Some(current_indent_level);
+ }
+
+ let indent_width = indent_width.unwrap(); // safe. see above
+ if current_indent_level % indent_width != 0 {
+ return Err("indentation is offset".into());
+ }
+
+ let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize;
+ match diff {
+ 0 => (), // same level of indentation
+ 1 => res.push(Begin), // new level of indentation
+ -1 => res.push(End), // old level of indentation
+ _ => return Err("indentation stepped by too much in one go".into())
+ }
+ indent_level = current_indent_level;
break;
}
}
}
- } else { // get rid of whitespace
+ } else { // get rid of excess (all) whitespace
while input.peek() == Some(&' ') { input.next(); }
}
},
@@ -44,7 +91,7 @@ pub fn tokenize(input: &str) -> Vec<Token> {
start_of_line = true;
res.push(Sep('\n'))
},
- c if c.is_whitespace() => (), // tabs etc are not supported
+ c if c.is_whitespace() => return Err("tabs etc are not supported".into()),
'\'' => { // single quoted strings, i.e. chars
res.push(Sep('\''));
while let Some(x) = input.next() {
@@ -133,11 +180,11 @@ pub fn tokenize(input: &str) -> Vec<Token> {
}
res.push(Word(String::from(&buf)));
},
- '.' | ',' | ':' | ';' |
- '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep
+ '.' | ',' | ':' | ';' | // punctuation
+ '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)),
_ => res.push(Sep(c)) // for now: treat unknown chars as Sep
}
buf.clear();
}
- return res;
+ return Ok(res);
}