From fbfeffad5d85b73102a0f4e5ed607c33319f09e6 Mon Sep 17 00:00:00 2001 From: JJ Date: Mon, 17 Jul 2023 22:11:40 -0700 Subject: compiler: implement a borrow checker free tree and begin lexing --- src/lex.rs | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/lex.rs (limited to 'src/lex.rs') diff --git a/src/lex.rs b/src/lex.rs new file mode 100644 index 0000000..7c0cc21 --- /dev/null +++ b/src/lex.rs @@ -0,0 +1,121 @@ +use std::todo; +use crate::tree::Tree; +use multipeek::multipeek; + +/// **Basic** syntax tokens. +pub enum Token { + Word(String), + Lit(String), + Sep(char) +} + +/// Parses whitespace-sensitive code into an unambiguous syntax tree. +/// Also useful for formatting. +pub fn tokenize(input: &str) -> Tree> { + use Token::*; + let mut indendation_level = 0; + let mut buffer = String::new(); + let mut result = Tree::new(Vec::new()); + let ctx = result.data(result.root()); + + // `char` in rust is four bytes it's fine + let mut input = multipeek(input.chars()); + while let Some(c) = input.next() { + match c { + ' ' => todo!(), + '\n' => todo!(), + '\t' => todo!(), + '\'' => { + ctx.push(Sep('\'')); + while let Some(x) = input.next() { + if x == '\\' { + buffer.push(x); + continue; + } else if x == '\'' { + break; + } else { + buffer.push(x); + } + } + ctx.push(Lit(String::from(&buffer))); + ctx.push(Sep('\'')); + }, + '"' => { // triple quoted strings and regular strings + if input.peek_nth(0) == Some(&'"') && + input.peek_nth(1) == Some(&'"') { + input.next(); input.next(); + ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"')); + while let Some(c) = input.next() { + if c == '"' && + input.peek_nth(1) == Some(&'"') && + input.peek_nth(2) == Some(&'"') { + input.next(); input.next(); + break; + } + buffer.push(c); + } + ctx.push(Lit(String::from(&buffer))); + ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"')); + } else { + ctx.push(Sep('"')); + while let Some(x) = input.next() { + if x == '\\' { + buffer.push(x); + continue; + } else if x == '"' { + break; + } + buffer.push(c); + } + ctx.push(Lit(String::from(&buffer))); + ctx.push(Sep('"')); + } + }, + '#' => { + if input.peek() == Some(&'[') { + input.next(); + ctx.push(Sep('#')); ctx.push(Sep('[')); + let mut comment_level = 1; + while let Some(x) = input.next() && comment_level > 0 { + if x == '#' && input.peek() == Some(&'[') { + comment_level += 1; + input.next(); + } else if x == ']' && input.peek() == Some(&'#') { + comment_level -= 1; + input.next(); + } else { + buffer.push(x); + } + } + ctx.push(Lit(String::from(&buffer))); + ctx.push(Sep(']')); ctx.push(Sep('#')); + } else { + ctx.push(Sep('#')); + while let Some(x) = input.next() && x != '\n' { + buffer.push(x); + } + ctx.push(Lit(String::from(&buffer))); + } + }, + 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { + while let Some(x) = input.peek() { + match x { + 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { + buffer.push(c); + input.next(); + }, + _ => break + } + } + ctx.push(Word(String::from(&buffer))); + }, + '.' | ',' | ':' | ';' | + '(' | ')' | '[' | ']' | '{' | '}' => ctx.push(Sep(c)), + _ => ctx.push(Sep(c)) + } + buffer.clear(); + } + return result; +} + +// note: we can't have a TokenStream because there is significant whitespace. so, we construct the tree structure here, although we don't do anything fancy with the tokens. -- cgit v1.2.3-70-g09d2