aboutsummaryrefslogtreecommitdiff
path: root/src/lex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/lex.rs')
-rw-r--r--src/lex.rs121
1 files changed, 121 insertions, 0 deletions
diff --git a/src/lex.rs b/src/lex.rs
new file mode 100644
index 0000000..7c0cc21
--- /dev/null
+++ b/src/lex.rs
@@ -0,0 +1,121 @@
+use std::todo;
+use crate::tree::Tree;
+use multipeek::multipeek;
+
+/// **Basic** syntax tokens.
+///
+/// These are the only token kinds produced by [`tokenize`]; no further
+/// classification (keywords, numbers, operators, ...) happens at this level.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Token {
+    /// A run of ASCII letters, ASCII digits and underscores.
+    Word(String),
+    /// The raw text found between the delimiters of a string literal or a
+    /// comment, with the delimiters themselves stripped.
+    Lit(String),
+    /// A single delimiter or punctuation character.
+    Sep(char),
+}
+
+/// Parses whitespace-sensitive code into an unambiguous syntax tree.
+/// Also useful for formatting.
+///
+/// The input is scanned one `char` at a time and every recognised token is
+/// appended to the token list stored at the root node of the returned tree.
+///
+/// Recognised constructs:
+/// * `'...'` and `"..."` literals, where a backslash escapes the next character,
+/// * `"""..."""` triple-quoted literals (no escape processing),
+/// * `#[ ... ]#` block comments, which may nest, and `# ...` line comments,
+/// * words made of ASCII letters, ASCII digits and `_`,
+/// * any other character, emitted as a single [`Token::Sep`].
+///
+/// Unterminated literals and comments are not reported; they simply end at
+/// the end of the input.
+///
+/// # Panics
+///
+/// Whitespace handling is not implemented yet: a space, newline or tab
+/// outside of a literal or comment hits `todo!()`.
+pub fn tokenize(input: &str) -> Tree<Vec<Token>> {
+    use Token::*;
+    // Scratch space for the text of the token currently being read; it is
+    // cleared after every iteration of the main loop.
+    let mut buffer = String::new();
+    let mut result = Tree::new(Vec::new());
+    let ctx = result.data(result.root());
+
+    // Iterating by `char` (a full Unicode scalar value) rather than by byte
+    // keeps multi-byte characters intact.
+    let mut input = multipeek(input.chars());
+    while let Some(c) = input.next() {
+        match c {
+            ' ' => todo!(),
+            '\n' => todo!(),
+            '\t' => todo!(),
+            '\'' => {
+                ctx.push(Sep('\''));
+                read_quoted(&mut input, '\'', &mut buffer);
+                ctx.push(Lit(buffer.clone()));
+                ctx.push(Sep('\''));
+            }
+            // Triple-quoted strings and regular strings.
+            '"' => {
+                if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') {
+                    // Consume the remaining two quotes of the opening `"""`.
+                    input.next();
+                    input.next();
+                    ctx.push(Sep('"'));
+                    ctx.push(Sep('"'));
+                    ctx.push(Sep('"'));
+                    while let Some(x) = input.next() {
+                        // `x` is already consumed, so the two quotes that
+                        // complete a closing `"""` sit at peek offsets 0 and 1.
+                        if x == '"'
+                            && input.peek_nth(0) == Some(&'"')
+                            && input.peek_nth(1) == Some(&'"')
+                        {
+                            input.next();
+                            input.next();
+                            break;
+                        }
+                        buffer.push(x);
+                    }
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep('"'));
+                    ctx.push(Sep('"'));
+                    ctx.push(Sep('"'));
+                } else {
+                    ctx.push(Sep('"'));
+                    read_quoted(&mut input, '"', &mut buffer);
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep('"'));
+                }
+            }
+            '#' => {
+                if input.peek() == Some(&'[') {
+                    input.next();
+                    ctx.push(Sep('#'));
+                    ctx.push(Sep('['));
+                    let mut comment_level = 1;
+                    // Check the nesting level *before* pulling the next
+                    // character so nothing after the final `]#` is swallowed.
+                    while comment_level > 0 {
+                        let Some(x) = input.next() else { break };
+                        if x == '#' && input.peek() == Some(&'[') {
+                            comment_level += 1;
+                            input.next();
+                        } else if x == ']' && input.peek() == Some(&'#') {
+                            comment_level -= 1;
+                            input.next();
+                        } else {
+                            buffer.push(x);
+                        }
+                    }
+                    // NOTE(review): the `#[` / `]#` markers of *nested*
+                    // comments are not copied into the literal - confirm
+                    // whether that is intended.
+                    ctx.push(Lit(buffer.clone()));
+                    ctx.push(Sep(']'));
+                    ctx.push(Sep('#'));
+                } else {
+                    ctx.push(Sep('#'));
+                    // NOTE(review): the terminating newline is consumed here
+                    // and never reaches the main loop - revisit once newline
+                    // handling is implemented.
+                    while let Some(x) = input.next() {
+                        if x == '\n' {
+                            break;
+                        }
+                        buffer.push(x);
+                    }
+                    ctx.push(Lit(buffer.clone()));
+                }
+            }
+            c if is_word_char(c) => {
+                // The first character was already taken off the iterator.
+                buffer.push(c);
+                while let Some(&x) = input.peek() {
+                    if !is_word_char(x) {
+                        break;
+                    }
+                    buffer.push(x);
+                    input.next();
+                }
+                ctx.push(Word(buffer.clone()));
+            }
+            // Punctuation (`.` `,` `:` `;`), brackets and every other
+            // character become a single-character separator.
+            _ => ctx.push(Sep(c)),
+        }
+        buffer.clear();
+    }
+    result
+}
+
+/// Returns `true` for characters that may appear in a [`Token::Word`]:
+/// ASCII letters, ASCII digits and the underscore.
+fn is_word_char(c: char) -> bool {
+    c.is_ascii_alphanumeric() || c == '_'
+}
+
+/// Reads the body of a quoted literal into `buffer` and consumes the closing
+/// `quote`.
+///
+/// A backslash is copied verbatim together with the character that follows
+/// it, so an escaped quote does not end the literal. If the input ends before
+/// the closing quote, reading stops there without reporting an error.
+fn read_quoted(chars: &mut impl Iterator<Item = char>, quote: char, buffer: &mut String) {
+    while let Some(x) = chars.next() {
+        if x == '\\' {
+            buffer.push(x);
+            if let Some(escaped) = chars.next() {
+                buffer.push(escaped);
+            }
+        } else if x == quote {
+            break;
+        } else {
+            buffer.push(x);
+        }
+    }
+}
+
+// Note: we can't have a TokenStream because whitespace is significant. So we construct the tree structure here, although we don't do anything fancy with the tokens.