about summary refs log tree commit diff
diff options
context:
space:
mode:
author JJ 2023-10-26 08:57:33 +0000
committer JJ 2023-10-26 08:57:33 +0000
commit 7473fb09ed89e5e2fde8a7d6e89bc5d28151bdae (patch)
tree fa1be73a3b87f30ca685c3a6c6f213b2c7d3ee50
parent d5f89ba0e2fd0458f4e51e13233069583d5a89ac (diff)
compiler: redesign lexer. still needs unicode support & expr indentation
-rw-r--r-- src/lex.rs 361
-rw-r--r-- src/main.rs 4
2 files changed, 263 insertions, 102 deletions
diff --git a/src/lex.rs b/src/lex.rs
index 8999aea..d3d5a7e 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -3,41 +3,119 @@ use multipeek::multipeek;
pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
pub type TokenStream = Vec<Token>;
+#[derive(Clone, PartialEq, Debug)]
+pub enum LexicalError {
+ InvalidIndentation,
+ MismatchedParens,
+ MismatchedBrackets,
+}
+
+impl std::fmt::Display for LexicalError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{:?}", self)
+ }
+}
+impl std::error::Error for LexicalError {}
+
/// **Basic** syntax tokens. Form an unambiguous TokenStream.
#[derive(Clone, PartialEq)]
pub enum Token {
Word(String), // identifiers.
- Lit(String), // literal value, ex. for strings/comments.
- Sep(char), // punctuation. non-word tokens.
- Begin, End // scope indicators.
+ Num(String), // numeric value, ex. 413, 0b101011, 0xabcd
+ Lit(Literal), // literal value, ex. for strings/comments.
+ Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
+ Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead?
+}
+
+#[derive(Clone, PartialEq)]
+pub enum Literal {
+ Char(String),
+ SingleLineString(String),
+ MultiLineString(String),
+ Comment(String),
+ DocComment(String),
}
-/// All keywords that may continue a line. For knowing valid line splits.
-const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];
+/// All punctuation recognized by the lexer.
+/// Note the distinction between FuncLeftParen and TupleLeftParen.
+#[derive(Clone, PartialEq)]
+pub enum Punctuation {
+ Comma, // ,
+ Period, // .
+ Semicolon, // ;
+ Colon, // :
+ BackTick, // `
+ SingleQuote, // '
+ DoubleQuote, // "
+ FuncLeftParen, // (
+ FuncRightParen, // )
+ TupleLeftParen, // (
+ TupleRightParen, // )
+ GenericLeftBracket, // [
+ GenericRightBracket, // ]
+ ArrayLeftBracket, // [
+ ArrayRightBracket, // ]
+ StructLeftBrace, // {
+ StructRightBrace, // }
+ Equals, // =
+ Plus, // +
+ Minus, // distinction between minus and negative.
+ Negative, // negative binds tightly: there is no whitespace following.
+ Times, // *
+ Slash, // /
+ LessThan, // <
+ GreaterThan, // >
+ At, // @
+ Sha, // $
+ Tilde, // ~
+ And, // &
+ Percent, // %
+ Or, // |
+ Exclamation, // !
+ Question, // ?
+ Caret, // ^
+ Backslash, // \
+}
/// Parses whitespace-sensitive code into an unambiguous TokenStream.
/// Also useful for formatting.
-// todo: support indentation within expressions
-// nim: "As a rule of thumb, indentation within expressions is
-// allowed after operators, an open parenthesis and after commas."
+// todo: rewrite indentation parsing to do what nim does, annotate tokens with indentation preceding
pub fn tokenize(input: &str) -> Result<TokenStream> {
// The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
// Tokens are matched by looping within their case until complete.
- // This then eliminates the need for almost all global parser state.
+ // This then eliminates the need for most global parser state. (i hate state)
use Token::*;
- let mut start_of_line = true; // state
- let mut indent_level = 0; // state
- let mut indent_width = None; // state
- let mut buf = String::new(); // buffer
- let mut res = Vec::new(); // result
+ use Literal::*;
+ use Punctuation::*;
+ use LexicalError::*;
+ enum Paren { Func, Tuple }
+ enum Bracket { Generic, Array }
+ struct State {
+ start_of_line: bool,
+ indent_level: isize,
+ indent_width: isize,
+ paren_stack: Vec<Paren>,
+ bracket_stack: Vec<Bracket>,
+ }
+
+ let mut state = State {
+ start_of_line: true,
+ indent_level: 0,
+ indent_width: 0,
+ paren_stack: vec!(),
+ bracket_stack: vec!(),
+ };
+
+ let mut buf = String::new();
+ let mut res = Vec::new();
// `char` in rust is four bytes it's fine
let mut input = multipeek(input.chars());
while let Some(c) = input.next() {
match c {
' ' => {
- if start_of_line { // indentation
+ if state.start_of_line { // indentation
let mut current_indent_level = 1;
while let Some(x) = input.peek() {
match x {
@@ -47,38 +125,42 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
// really gross. this just checks if the previous token was a newline,
// and that the token before it was punctuation or a known "operator",
// and if so disregards indentation and treats it as a line continuation.
- if let Some(&Sep('\n')) = res.get(res.len() - 1) {
- if let Some(y) = res.get(res.len() - 2) {
- if let Word(z) = y {
- if valid_continuations.contains(&&z[..]) {
+ if let Some(Newline) = res.get(res.len() - 1) {
+ if let Some(prev) = res.get(res.len() - 2) {
+ match prev { // all keywords and punctuation that may continue a line
+ // workaround for https://github.com/rust-lang/rust/issues/87121
+ Word(a) if a == "==" || a == "and" || a == "or" ||
+ a == "xor" || a == "in" || a == "is" => {
+ res.pop();
+ break;
+ },
+ &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) |
+ Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => {
res.pop();
break;
}
- } else if let Sep(_) = y {
- res.pop();
- break;
+ _ => {}
}
}
}
// will only fire once. allows us to support X number of spaces so long as it's consistent
- if indent_width.is_none() {
- indent_width = Some(current_indent_level);
+ if state.indent_width == 0 {
+ state.indent_width = current_indent_level;
}
- let indent_width = indent_width.unwrap(); // safe. see above
- if current_indent_level % indent_width != 0 {
- return Err("indentation is offset".into());
+ if current_indent_level % state.indent_width != 0 {
+ return Err(InvalidIndentation.into());
}
- let diff = (current_indent_level as isize - indent_level as isize) / indent_width as isize;
+ let diff = (current_indent_level - state.indent_level) / state.indent_width;
match diff {
0 => (), // same level of indentation
1 => res.push(Begin), // new level of indentation
-1 => res.push(End), // old level of indentation
- _ => return Err("indentation stepped by too much in one go".into())
+ _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs
}
- indent_level = current_indent_level;
+ state.indent_level = current_indent_level;
break;
}
}
@@ -87,13 +169,12 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
while input.peek() == Some(&' ') { input.next(); }
}
},
- '\n' => { // newlines are separators
- start_of_line = true;
- res.push(Sep('\n'))
+ '\t' => return Err(InvalidIndentation.into()),
+ '\n' => {
+ state.start_of_line = true;
+ res.push(Newline)
},
- c if c.is_whitespace() => return Err("tabs etc are not supported".into()),
- '\'' => { // single quoted strings, i.e. chars
- res.push(Sep('\''));
+ '\'' => { // chars!
while let Some(x) = input.next() {
match x {
'\'' => break,
@@ -101,88 +182,170 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
_ => buf.push(x)
}
}
- res.push(Lit(String::from(&buf)));
- res.push(Sep('\''));
+ res.push(Lit(Char(String::from(&buf))));
},
- '"' => { // triple quoted strings
- if input.peek_nth(0) == Some(&'"') &&
- input.peek_nth(1) == Some(&'"') {
- input.next(); input.next();
- res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
- while let Some(x) = input.next() {
- match x {
- '"' if input.peek_nth(1) == Some(&'"') &&
- input.peek_nth(2) == Some(&'"') => {
- break;
- },
- _ => buf.push(x)
+ '"' => { // strings!
+ match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) {
+ (Some('"'), Some('"')) => { // triple quoted strings
+ input.next(); input.next();
+ while let Some(x) = input.next() {
+ match x {
+ '"' if input.peek_nth(1) == Some(&'"') &&
+ input.peek_nth(2) == Some(&'"') => {
+ input.next(); input.next();
+ break;
+ },
+ _ => buf.push(x)
+ }
}
- }
- res.push(Lit(String::from(&buf)));
- input.next(); input.next();
- res.push(Sep('"')); res.push(Sep('"')); res.push(Sep('"'));
- } else { // regular strings
- res.push(Sep('"'));
- while let Some(x) = input.next() {
- match x {
- '"' => break,
- '\\' => if let Some(y) = input.next() { buf.push(y) },
- _ => buf.push(x)
+ res.push(Lit(MultiLineString(String::from(&buf))));
+ },
+ (_, _) => { // single quoted strings
+ while let Some(x) = input.next() {
+ match x {
+ '"' => break,
+ '\\' => if let Some(y) = input.next() { buf.push(y) },
+ _ => buf.push(x)
+ }
}
+ res.push(Lit(SingleLineString(String::from(&buf))));
}
- res.push(Lit(String::from(&buf)));
- res.push(Sep('"'));
}
},
- '#' => { // block comment, can be nested
- if input.peek() == Some(&'[') {
- input.next();
- res.push(Sep('#')); res.push(Sep('['));
- let mut comment_level = 1;
- while let Some(x) = input.next() && comment_level > 0 {
- match x {
- '#' if input.peek() == Some(&'[') => {
- comment_level += 1;
- input.next();
- },
- ']' if input.peek() == Some(&'#') => {
- comment_level -= 1;
- input.next();
- },
- _ => buf.push(x)
+ '#' => { // comments!
+ match input.peek() {
+ Some('[') => { // block comment, can be nested
+ input.next();
+ let mut comment_level = 1;
+ while let Some(x) = input.next() && comment_level > 0 {
+ match x {
+ '#' if input.peek() == Some(&'[') => {
+ comment_level += 1;
+ input.next();
+ },
+ ']' if input.peek() == Some(&'#') => {
+ comment_level -= 1;
+ input.next();
+ },
+ _ => buf.push(x)
+ }
}
- }
- res.push(Lit(String::from(&buf)));
- res.push(Sep(']')); res.push(Sep('#'));
- } else { // standard comment, runs until eol
- res.push(Sep('#'));
- while let Some(x) = input.peek() {
- match x {
- '\n' => break,
- _ => {
- buf.push(*x);
- input.next();
+ res.push(Lit(Comment(String::from(&buf))));
+ },
+ Some(&'#') => { // documentation comment
+ input.next();
+ while let Some(x) = input.next() {
+ match x {
+ '\n' => break,
+ _ => {
+ buf.push(x);
+ }
+ }
+ }
+ res.push(Lit(DocComment(String::from(&buf))));
+ },
+ _ => { // standard comment, runs til EOL
+ while let Some(x) = input.next() {
+ match x {
+ '\n' => break,
+ _ => {
+ buf.push(x);
+ }
}
}
+ res.push(Lit(Comment(String::from(&buf))));
}
- res.push(Lit(String::from(&buf)));
}
},
- 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => { // valid identifier
- while let Some(x) = input.peek() {
+ 'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers!
+ buf.push(c); // todo: unicode support
+ while let Some(x) = input.next() {
match x {
- 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' => {
- buf.push(*x);
+ 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+ buf.push(x);
+ },
+ _ => {
+ res.push(Word(String::from(&buf)));
+ match x { // () and [] denote both parameters/generics and tuples/arrays
+ '(' => { // we must disambiguate by treating those *directly* after words as such
+ res.push(Sep(FuncLeftParen));
+ state.paren_stack.push(Paren::Func);
+ },
+ '[' => {
+ res.push(Sep(GenericLeftBracket));
+ state.bracket_stack.push(Bracket::Generic);
+ },
+ _ => {},
+ }
+ break;
+ }
+ }
+ }
+ },
+ '0'..='9' => { // numeric literals!
+ buf.push(c);
+ while let Some(x) = input.next() {
+ match x { // todo: unicode support
+ 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+ buf.push(x);
input.next();
},
_ => break
}
}
- res.push(Word(String::from(&buf)));
+ res.push(Num(String::from(&buf)))
+ },
+ '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator.
+ match input.peek() {
+ Some(' ') => res.push(Sep(Minus)),
+ _ => res.push(Sep(Negative))
+ }
+ },
+ '(' => { // note: FuncParens were matched above, directly after identifiers
+ res.push(Sep(TupleLeftParen));
+ state.paren_stack.push(Paren::Tuple);
+ },
+ '[' => { // note: GenericBrackets were matched above, directly after identifiers
+ res.push(Sep(ArrayLeftBracket));
+ state.bracket_stack.push(Bracket::Array);
+ },
+ ')' => {
+ match state.paren_stack.pop() {
+ Some(Paren::Func) => res.push(Sep(FuncRightParen)),
+ Some(Paren::Tuple) => res.push(Sep(TupleRightParen)),
+ None => return Err(MismatchedParens.into()),
+ }
+ },
+ ']' => {
+ match state.bracket_stack.pop() {
+ Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)),
+ Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
+ None => return Err(MismatchedBrackets.into()),
+ }
},
- '.' | ',' | ':' | ';' | // punctuation
- '(' | ')' | '[' | ']' | '{' | '}' => res.push(Sep(c)),
- _ => res.push(Sep(c)) // for now: treat unknown chars as Sep
+ ',' => res.push(Sep(Comma)),
+ '.' => res.push(Sep(Period)),
+ ';' => res.push(Sep(Semicolon)),
+ ':' => res.push(Sep(Colon)),
+ '`' => res.push(Sep(BackTick)),
+ '{' => res.push(Sep(StructLeftBrace)),
+ '}' => res.push(Sep(StructRightBrace)),
+ '=' => res.push(Sep(Equals)),
+ '+' => res.push(Sep(Plus)),
+ '*' => res.push(Sep(Times)),
+ '/' => res.push(Sep(Slash)),
+ '<' => res.push(Sep(LessThan)),
+ '>' => res.push(Sep(GreaterThan)),
+ '@' => res.push(Sep(At)),
+ '$' => res.push(Sep(Sha)),
+ '~' => res.push(Sep(Tilde)),
+ '&' => res.push(Sep(And)),
+ '|' => res.push(Sep(Or)),
+ '!' => res.push(Sep(Exclamation)),
+ '?' => res.push(Sep(Question)),
+ '^' => res.push(Sep(Caret)),
+ '\\' => res.push(Sep(Backslash)),
+ _ => return Err("unknown character".into()) // todo: support unicode!
}
buf.clear();
}
diff --git a/src/main.rs b/src/main.rs
index c366367..5cbdf00 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,8 +2,6 @@
#![feature(exclusive_range_pattern, let_chains)]
mod lex;
-// mod parse;
-// mod check;
mod tree;
-fn main() { }
+fn main() {}