//! Lexing: splits source text into fundamental tokens.

/// **Basic** syntax tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Word(String), // identifiers.
    Lit(String),  // literal value, ex. for strings/comments.
    Sep(char),    // punctuation. non-word tokens.
    Ind(usize),   // indentation (number of leading spaces on a line).
}

/// Lexes a file into a Vec of fundamental Tokens.
///
/// Design: the input is collected into a `Vec<char>` and scanned with an
/// explicit cursor, giving arbitrary lookahead with no external crates
/// (the previous draft pulled in `multipeek` for the same purpose).
/// Tokens are matched by looping within their case until complete, which
/// eliminates the need for almost all global lexer state.
///
/// Fixes over the previous draft:
/// - identifiers no longer drop their first character;
/// - `start_of_line` is cleared after every non-newline character, so
///   mid-line runs of spaces are skipped instead of re-lexed as indentation;
/// - indentation spaces are actually consumed (the old peek loop never
///   advanced past them);
/// - triple-quoted strings terminate on `"""` (the old lookahead was off by
///   one and effectively required four quotes, then double-consumed);
/// - block comments stop exactly at the matching `]#` instead of consuming
///   one extra character;
/// - identifier ranges are inclusive (`'a'..='z'`), so `z`, `Z` and `9`
///   are accepted, and no unstable exclusive-range patterns are used.
pub fn tokenize(input: &str) -> Vec<Token> {
    use Token::*;
    let chars: Vec<char> = input.chars().collect();
    let mut i = 0;                // cursor into `chars`
    let mut start_of_line = true; // do spaces here count as indentation?
    let mut res = Vec::new();     // result

    while i < chars.len() {
        let c = chars[i];
        i += 1;
        match c {
            ' ' => {
                if start_of_line {
                    // Indentation: count and consume the full run of spaces.
                    let mut indentation_level = 1;
                    while i < chars.len() && chars[i] == ' ' {
                        indentation_level += 1;
                        i += 1;
                    }
                    // Empty lines (and trailing spaces at EOF) produce no Ind.
                    if i < chars.len() && chars[i] != '\n' {
                        res.push(Ind(indentation_level));
                    }
                } else {
                    // Interior whitespace is insignificant: skip the run.
                    while i < chars.len() && chars[i] == ' ' {
                        i += 1;
                    }
                }
            },
            '\n' => res.push(Sep('\n')),  // newlines are separators
            c if c.is_whitespace() => (), // tabs etc. are not supported
            '\'' => {
                // Single-quoted strings, i.e. chars. `\x` escapes to bare `x`.
                res.push(Sep('\''));
                let mut buf = String::new();
                while i < chars.len() {
                    let x = chars[i];
                    i += 1;
                    match x {
                        '\'' => break,
                        '\\' => {
                            if i < chars.len() {
                                buf.push(chars[i]);
                                i += 1;
                            }
                        },
                        _ => buf.push(x),
                    }
                }
                res.push(Lit(buf));
                res.push(Sep('\''));
            },
            '"' if i + 1 < chars.len() && chars[i] == '"' && chars[i + 1] == '"' => {
                // Triple-quoted string: raw text until the matching `"""`.
                i += 2;
                res.extend([Sep('"'), Sep('"'), Sep('"')]);
                let mut buf = String::new();
                while i < chars.len() {
                    let x = chars[i];
                    i += 1;
                    if x == '"' && i + 1 < chars.len() && chars[i] == '"' && chars[i + 1] == '"' {
                        i += 2; // consume the remaining two closing quotes
                        break;
                    }
                    buf.push(x);
                }
                res.push(Lit(buf));
                res.extend([Sep('"'), Sep('"'), Sep('"')]);
            },
            '"' => {
                // Regular double-quoted string; `\x` escapes to bare `x`.
                res.push(Sep('"'));
                let mut buf = String::new();
                while i < chars.len() {
                    let x = chars[i];
                    i += 1;
                    match x {
                        '"' => break,
                        '\\' => {
                            if i < chars.len() {
                                buf.push(chars[i]);
                                i += 1;
                            }
                        },
                        _ => buf.push(x),
                    }
                }
                res.push(Lit(buf));
                res.push(Sep('"'));
            },
            '#' if i < chars.len() && chars[i] == '[' => {
                // Block comment `#[ ... ]#`; may be nested.
                // Nested delimiters are consumed but (as before) not included
                // in the Lit text.
                i += 1;
                res.push(Sep('#'));
                res.push(Sep('['));
                let mut comment_level = 1;
                let mut buf = String::new();
                while i < chars.len() && comment_level > 0 {
                    let x = chars[i];
                    i += 1;
                    match x {
                        '#' if i < chars.len() && chars[i] == '[' => {
                            comment_level += 1;
                            i += 1;
                        },
                        ']' if i < chars.len() && chars[i] == '#' => {
                            comment_level -= 1;
                            i += 1;
                        },
                        _ => buf.push(x),
                    }
                }
                res.push(Lit(buf));
                res.push(Sep(']'));
                res.push(Sep('#'));
            },
            '#' => {
                // Standard comment: runs until end of line (newline is left
                // in the stream so it still becomes a Sep).
                res.push(Sep('#'));
                let mut buf = String::new();
                while i < chars.len() && chars[i] != '\n' {
                    buf.push(chars[i]);
                    i += 1;
                }
                res.push(Lit(buf));
            },
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
                // Valid identifier: greedily take the whole word, including
                // the character that started it.
                let mut buf = String::new();
                buf.push(c);
                while i < chars.len() && (chars[i].is_ascii_alphanumeric() || chars[i] == '_') {
                    buf.push(chars[i]);
                    i += 1;
                }
                res.push(Word(buf));
            },
            // Punctuation, and (for now) any unknown character, is a Sep.
            _ => res.push(Sep(c)),
        }
        // Indentation is only meaningful immediately after a newline.
        start_of_line = c == '\n';
    }
    res
}