about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author JJ 2023-07-18 22:45:44 +0000
committer JJ 2023-07-18 22:45:44 +0000
commit f3b8cdaf74dac7533894222d6c1239cef898831b (patch)
tree 5f6953b05ed6954ccf7293418a1262a434cf5e52 /src
parent fbfeffad5d85b73102a0f4e5ed607c33319f09e6 (diff)
compiler: complete basic lexer
Diffstat (limited to 'src')
-rw-r--r-- src/lex.rs 168
1 files changed, 95 insertions, 73 deletions
diff --git a/src/lex.rs b/src/lex.rs
index 7c0cc21..bfd74ae 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -1,121 +1,143 @@
-use std::todo;
-use crate::tree::Tree;
use multipeek::multipeek;
/// **Basic** syntax tokens.
pub enum Token {
- Word(String),
- Lit(String),
- Sep(char)
+ Word(String), // identifiers.
+ Lit(String), // literal value, ex. for strings/comments.
+ Sep(char), // punctuation. non-word tokens.
+ Ind(usize) // indentation.
}
-/// Parses whitespace-sensitive code into an unambiguous syntax tree.
-/// Also useful for formatting.
-pub fn tokenize(input: &str) -> Tree<Vec<Token>> {
+/// Lexes a file into a Vec of fundamental Tokens.
+pub fn tokenize(input: &str) -> Vec<Token> {
+ // The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
+ // Tokens are matched by looping within their case until complete.
+ // This then eliminates the need for almost all global parser state.
+
use Token::*;
- let mut indendation_level = 0;
- let mut buffer = String::new();
- let mut result = Tree::new(Vec::new());
- let ctx = result.data(result.root());
+ let mut start_of_line = true; // state
+ let mut buf = String::new(); // buffer
+ let mut res = Vec::new(); // result
// `char` in rust is four bytes it's fine
let mut input = multipeek(input.chars());
while let Some(c) = input.next() {
match c {
- ' ' => todo!(),
- '\n' => todo!(),
- '\t' => todo!(),
- '\'' => {
- ctx.push(Sep('\''));
+ ' ' => {
+ if start_of_line { // indentation, to be dealt with later
+ let mut indendation_level = 1;
+ while let Some(x) = input.peek() {
+ match x {
+ ' ' => indendation_level += 1,
+ '\n' => break, // empty line
+ _ => {
+ res.push(Ind(indendation_level));
+ break;
+ }
+ }
+ }
+ } else { // get rid of whitespace
+ while input.peek() == Some(&' ') { input.next(); }
+ }
+ },
+ '\n' => { // newlines are separators
+ start_of_line = true;
+ res.push(Sep('\n'))
+ },
+ c if c.is_whitespace() => (), // tabs etc are not supported
+ '\'' => { // single quoted strings, i.e. chars
+ res.push(Sep('\''));
while let Some(x) = input.next() {
- if x == '\\' {
- buffer.push(x);
- continue;
- } else if x == '\'' {
- break;
- } else {
- buffer.push(x);
+ match x {
+ '\'' => break,
+ '\\' => if let Some(y) = input.next() { buf.push(y) },
+ _ => buf.push(x)
}
}
- ctx.push(Lit(String::from(&buffer)));
- ctx.push(Sep('\''));
+ res.push(Lit(String::from(&buf)));
+ res.push(Sep('\''));
},
- '"' => { // triple quoted strings and regular strings
+ '"' => { // triple quoted strings
if input.peek_nth(0) == Some(&'"') &&
input.peek_nth(1) == Some(&'"') {
input.next(); input.next();
- ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
- while let Some(c) = input.next() {
- if c == '"' &&
- input.peek_nth(1) == Some(&'"') &&
- input.peek_nth(2) == Some(&'"') {
- input.next(); input.next();
- break;
+ res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
+ while let Some(x) = input.next() {
+ match x {
+ '"' if input.peek_nth(1) == Some(&'"') &&
+ input.peek_nth(2) == Some(&'"') => {
+ break;
+ },
+ _ => buf.push(x)
}
- buffer.push(c);
}
- ctx.push(Lit(String::from(&buffer)));
- ctx.push(Sep('"')); ctx.push(Sep('"')); ctx.push(Sep('"'));
- } else {
- ctx.push(Sep('"'));
+ res.push(Lit(String::from(&buf)));
+ input.next(); input.next();
+ res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
+ } else { // regular strings
+ res.push(Sep('"'));
while let Some(x) = input.next() {
- if x == '\\' {
- buffer.push(x);
- continue;
- } else if x == '"' {
- break;
+ match x {
+ '"' => break,
+ '\\' => if let Some(y) = input.next() { buf.push(y) },
+ _ => buf.push(x)
}
- buffer.push(c);
}
- ctx.push(Lit(String::from(&buffer)));
- ctx.push(Sep('"'));
+ res.push(Lit(String::from(&buf)));
+ res.push(Sep('"'));
}
},
- '#' => {
+ '#' => { // block comment, can be nested
if input.peek() == Some(&'[') {
input.next();
- ctx.push(Sep('#')); ctx.push(Sep('['));
+ res.push(Sep('#')); res.push(Sep('['));
let mut comment_level = 1;
while let Some(x) = input.next() && comment_level > 0 {
- if x == '#' && input.peek() == Some(&'[') {
- comment_level += 1;
- input.next();
- } else if x == ']' && input.peek() == Some(&'#') {
- comment_level -= 1;
- input.next();
- } else {
- buffer.push(x);
+ match x {
+ '#' if input.peek() == Some(&'[') => {
+ comment_level += 1;
+ input.next();
+ },
+ ']' if input.peek() == Some(&'#') => {
+ comment_level -= 1;
+ input.next();
+ },
+ _ => buf.push(x)
}
}
- ctx.push(Lit(String::from(&buffer)));
- ctx.push(Sep(']')); ctx.push(Sep('#'));
- } else {
- ctx.push(Sep('#'));
- while let Some(x) = input.next() && x != '\n' {
- buffer.push(x);
+ res.push(Lit(String::from(&buf)));
+ res.push(Sep(']')); res.push(Sep('#'));
+ } else { // standard comment, runs until eol
+ res.push(Sep('#'));
+ while let Some(x) = input.peek() {
+ match x {
+ '\n' => break,
+ _ => {
+ buf.push(*x);
+ input.next();
+ }
+ }
}
- ctx.push(Lit(String::from(&buffer)));
+ res.push(Lit(String::from(&buf)));
}
},
- 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
+ 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier
while let Some(x) = input.peek() {
match x {
'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
- buffer.push(c);
+ buf.push(*x);
input.next();
},
_ => break
}
}
- ctx.push(Word(String::from(&buffer)));
+ res.push(Word(String::from(&buf)));
},
'.' | ',' | ':' | ';' |
- '(' | ')' | '[' | ']' | '{' | '}' => ctx.push(Sep(c)),
- _ => ctx.push(Sep(c))
+ '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)), // Sep
+ _ => res.push(Sep(c)) // for now: treat unknown chars as Sep
}
- buffer.clear();
+ buf.clear();
}
- return result;
+ return res;
}
-
-// note: we can't have a TokenStream because there is significant whitespace. so, we construct the tree structure here, although we don't do anything fancy with the tokens.