aboutsummaryrefslogtreecommitdiff
path: root/src/frontend/lex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/frontend/lex.rs')
-rw-r--r--src/frontend/lex.rs542
1 files changed, 542 insertions, 0 deletions
diff --git a/src/frontend/lex.rs b/src/frontend/lex.rs
new file mode 100644
index 0000000..771ba38
--- /dev/null
+++ b/src/frontend/lex.rs
@@ -0,0 +1,542 @@
+use multipeek::multipeek;
+
+pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
+pub struct TokenStream(Vec<Token>);
+
+impl IntoIterator for TokenStream {
+ type Item = Token;
+ type IntoIter = std::vec::IntoIter<Token>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.0.into_iter()
+ }
+}
+
+#[derive(Clone, PartialEq, Debug)]
+pub enum LexicalError {
+ InvalidIndentation,
+ MismatchedParens,
+ MismatchedBrackets,
+ UnknownPunctuation,
+}
+
+impl std::fmt::Display for LexicalError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{:?}", self)
+ }
+}
+impl std::error::Error for LexicalError {}
+
+/// **Basic** syntax tokens. Form an unambiguous TokenStream.
+#[derive(Clone, PartialEq)]
+pub enum Token {
+ Key(Keyword), // keyword identifiers.
+ Word(String), // non-keyword identifiers.
+ Num(String), // numeric value, ex. 413, 0b101011, 0xabcd
+ Lit(Literal), // literal value, ex. for strings/comments.
+ Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
+ Indent(usize), // indentation. denotes line breaks and scope at which a line starts.
+}
+
+#[derive(Clone, PartialEq)]
+pub enum Literal {
+ Char(String),
+ SingleLineString(String),
+ MultiLineString(String),
+ Comment(String),
+ DocComment(String),
+ MultiLineComment(String),
+}
+
+/// Keywords, made explicit for easier use with Rust.
+/// (strings inside match patterns are fucky!!)
+#[derive(Clone, PartialEq)]
+pub enum Keyword {
+ Pub, Let, Var, Const,
+ Func, Macro, Type,
+ Mod, From, Import,
+ For, While, Loop,
+ Block, Static,
+ If, When, Elif, Else, Match,
+ Try, Catch, Finally,
+ Struct, Tuple, Enum, Union, Interface,
+ Distinct, Ref, // todo: Mut once figured out
+ Break, Continue, Return,
+ In, Is, Of, As,
+}
+
+/// All punctuation recognized by the lexer.
+/// Note the distinction between FuncLeftParen and TupleLeftParen.
+#[derive(Clone, PartialEq)]
+pub enum Punctuation {
+ Comma, // ,
+ Period, // .
+ Semicolon, // ;
+ Colon, // :
+ BackTick, // `
+ SingleQuote, // '
+ DoubleQuote, // "
+ FuncLeftParen, // (
+ FuncRightParen, // )
+ TupleLeftParen, // (
+ TupleRightParen, // )
+ GenericLeftBracket, // [
+ GenericRightBracket, // ]
+ ArrayLeftBracket, // [
+ ArrayRightBracket, // ]
+ StructLeftBrace, // }
+ StructRightBrace, // }
+ Equals, // =
+ Plus, // +
+ Minus, // distinction between minus and negative.
+ Negative, // negative binds tightly: there is no whitespace following.
+ Times, // *
+ Slash, // /
+ LessThan, // <
+ GreaterThan, // >
+ At, // @
+ Sha, // $
+ Tilde, // ~
+ And, // &
+ Percent, // %
+ Or, // |
+ Exclamation, // !
+ Question, // ?
+ Caret, // ^
+ Backslash, // \
+}
+
+/// Parses whitespace-sensitive code into an unambiguous TokenStream.
+/// Also useful for formatting.
+pub fn tokenize(input: &str) -> Result<TokenStream> {
+ // The design of this lexer utilizes to great extent multipeek's arbitrary peeking.
+ // Tokens are matched by looping within their case until complete.
+ // This then eliminates the need for most global parser state. (i hate state)
+
+ use Token::*;
+ use Literal::*;
+ use Punctuation::*;
+ use LexicalError::*;
+ enum Paren { Func, Tuple }
+ enum Bracket { Generic, Array }
+ struct State {
+ start_of_line: bool,
+ paren_stack: Vec<Paren>,
+ bracket_stack: Vec<Bracket>,
+ }
+
+ let mut state = State {
+ start_of_line: true,
+ paren_stack: vec!(),
+ bracket_stack: vec!(),
+ };
+
+ let mut buf = String::new();
+ let mut res = Vec::new();
+
+ // `char` in rust is four bytes it's fine
+ let mut input = multipeek(input.chars());
+ while let Some(c) = input.next() {
+ match c {
+ ' ' => { // indentation! and whitespace
+ match res.last() {
+ Some(Indent(_)) => { // indentation!
+ res.pop(); // discard previous empty or useless Indent token
+ let mut current_indent_level = 1;
+ while let Some(x) = input.peek() {
+ match x {
+ ' ' => { current_indent_level += 1; input.next(); },
+ _ => match res.last() { // indentation ends
+ Some(Word(a)) if a == "==" || a == "and" || a == "or" ||
+ a == "xor" || a == "in" || a == "is" => break,
+ Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) |
+ Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) |
+ Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break,
+ _ => {
+ res.push(Indent(current_indent_level));
+ break;
+ }
+ }
+ }
+ }
+ },
+ _ => { // get rid of excess (all) whitespace between words/operators
+ while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); }
+ }
+ }
+ },
+ '\t' => return Err(InvalidIndentation.into()),
+ '\n' => res.push(Indent(0)),
+ '\'' => { // chars!
+ while let Some(x) = input.next() {
+ match x {
+ '\'' => break,
+ '\\' => if let Some(y) = input.next() { buf.push(y) },
+ _ => buf.push(x)
+ }
+ }
+ res.push(Lit(Char(String::from(&buf))));
+ },
+ '"' => { // strings!
+ match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) {
+ (Some('"'), Some('"')) => { // triple quoted strings
+ input.next(); input.next();
+ while let Some(x) = input.next() {
+ match x {
+ '"' if input.peek_nth(0) == Some(&'"') &&
+ input.peek_nth(1) == Some(&'"') => {
+ input.next(); input.next();
+ break;
+ },
+ _ => buf.push(x)
+ }
+ }
+ res.push(Lit(MultiLineString(String::from(&buf))));
+ },
+ (_, _) => { // single quoted strings
+ while let Some(x) = input.next() {
+ match x {
+ '"' => break,
+ '\\' => if let Some(y) = input.next() { buf.push(y) },
+ _ => buf.push(x)
+ }
+ }
+ res.push(Lit(SingleLineString(String::from(&buf))));
+ }
+ }
+ },
+ '#' => { // comments!
+ match input.peek() {
+ Some('[') => { // block comment, can be nested
+ input.next();
+ let mut comment_level = 1;
+ while let Some(x) = input.next() && comment_level > 0 {
+ match x {
+ '#' if input.peek() == Some(&'[') => {
+ comment_level += 1;
+ input.next();
+ },
+ ']' if input.peek() == Some(&'#') => {
+ comment_level -= 1;
+ input.next();
+ },
+ _ => buf.push(x)
+ }
+ }
+ res.push(Lit(MultiLineComment(String::from(&buf))));
+ },
+ Some(&'#') => { // documentation comment
+ input.next();
+ while let Some(x) = input.peek() {
+ match x {
+ '\n' => break,
+ _ => {
+ buf.push(*x);
+ }
+ }
+ input.next();
+ }
+ res.push(Lit(DocComment(String::from(&buf))));
+ },
+ _ => { // standard comment, runs til EOL
+ while let Some(x) = input.peek() {
+ match x {
+ '\n' => break,
+ _ => {
+ buf.push(*x);
+ }
+ }
+ input.next();
+ }
+ res.push(Lit(Comment(String::from(&buf))));
+ }
+ }
+ },
+ c if c.is_alphabetic() || c == '_' => { // valid identifiers!
+ buf.push(c);
+ while let Some(x) = input.peek() {
+ match x {
+ x if x.is_alphanumeric() || x == &'_' => {
+ buf.push(*x);
+ input.next();
+ },
+ _ => {
+ use Keyword::*;
+ match buf.as_str() { // keywords!
+ "pub" => res.push(Key(Pub)),
+ "let" => res.push(Key(Let)),
+ "var" => res.push(Key(Var)),
+ "const" => res.push(Key(Const)),
+ "func" => res.push(Key(Func)),
+ "macro" => res.push(Key(Macro)),
+ "type" => res.push(Key(Type)),
+ "mod" => res.push(Key(Mod)),
+ "from" => res.push(Key(From)),
+ "import" => res.push(Key(Import)),
+ "for" => res.push(Key(For)),
+ "while" => res.push(Key(While)),
+ "loop" => res.push(Key(Loop)),
+ "block" => res.push(Key(Block)),
+ "static" => res.push(Key(Static)),
+ "if" => res.push(Key(If)),
+ "when" => res.push(Key(When)),
+ "elif" => res.push(Key(Elif)),
+ "else" => res.push(Key(Else)),
+ "match" => res.push(Key(Match)),
+ "try" => res.push(Key(Try)),
+ "catch" => res.push(Key(Catch)),
+ "finally" => res.push(Key(Finally)),
+ "struct" => res.push(Key(Struct)),
+ "tuple" => res.push(Key(Tuple)),
+ "enum" => res.push(Key(Enum)),
+ "union" => res.push(Key(Union)),
+ "interface" => res.push(Key(Interface)),
+ "distinct" => res.push(Key(Distinct)),
+ "ref" => res.push(Key(Ref)),
+ "break" => res.push(Key(Break)),
+ "continue" => res.push(Key(Continue)),
+ "return" => res.push(Key(Return)),
+ "in" => res.push(Key(In)),
+ "is" => res.push(Key(Is)),
+ "of" => res.push(Key(Of)),
+ "as" => res.push(Key(As)),
+ _ => res.push(Word(String::from(&buf)))
+ }
+ match x { // () and [] denote both parameters/generics and tuples/arrays
+ '(' => { // we must disambiguate by treating those *directly* after words as such
+ res.push(Sep(FuncLeftParen));
+ state.paren_stack.push(Paren::Func);
+ input.next();
+ },
+ '[' => {
+ res.push(Sep(GenericLeftBracket));
+ state.bracket_stack.push(Bracket::Generic);
+ input.next();
+ },
+ _ => {},
+ }
+ break;
+ }
+ }
+ }
+ },
+ '0'..='9' => { // numeric literals!
+ buf.push(c);
+ while let Some(x) = input.peek() {
+ match x {
+ 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
+ buf.push(*x);
+ input.next();
+ },
+ _ => break
+ }
+ }
+ res.push(Num(String::from(&buf)))
+ },
+ '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator.
+ match input.peek() {
+ Some(' ') => res.push(Sep(Minus)),
+ _ => res.push(Sep(Negative))
+ }
+ },
+ '(' => { // note: FuncParens were matched above, directly after identifiers
+ res.push(Sep(TupleLeftParen));
+ state.paren_stack.push(Paren::Tuple);
+ },
+ '[' => { // note: GenericBrackets were matched above, directly after identifiers
+ res.push(Sep(ArrayLeftBracket));
+ state.bracket_stack.push(Bracket::Array);
+ },
+ ')' => {
+ match state.paren_stack.pop() {
+ Some(Paren::Func) => res.push(Sep(FuncRightParen)),
+ Some(Paren::Tuple) => res.push(Sep(TupleRightParen)),
+ None => return Err(MismatchedParens.into()),
+ }
+ },
+ ']' => {
+ match state.bracket_stack.pop() {
+ Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)),
+ Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
+ None => return Err(MismatchedBrackets.into()),
+ }
+ if input.peek() == Some(&'(') { // parameters following generics
+ res.push(Sep(FuncLeftParen));
+ state.paren_stack.push(Paren::Func);
+ input.next();
+ }
+ },
+ '`' => {
+ res.push(Sep(BackTick));
+ match input.peek() {
+ Some('(') => {
+ res.push(Sep(FuncLeftParen));
+ state.paren_stack.push(Paren::Func);
+ input.next();
+ },
+ Some('[') => {
+ res.push(Sep(GenericLeftBracket));
+ state.bracket_stack.push(Bracket::Generic);
+ input.next();
+ },
+ _ => {}
+ }
+ },
+ ',' => res.push(Sep(Comma)),
+ '.' => res.push(Sep(Period)),
+ ';' => res.push(Sep(Semicolon)),
+ ':' => res.push(Sep(Colon)),
+ '{' => res.push(Sep(StructLeftBrace)),
+ '}' => res.push(Sep(StructRightBrace)),
+ '=' => res.push(Sep(Equals)),
+ '+' => res.push(Sep(Plus)),
+ '*' => res.push(Sep(Times)),
+ '/' => res.push(Sep(Slash)),
+ '<' => res.push(Sep(LessThan)),
+ '>' => res.push(Sep(GreaterThan)),
+ '@' => res.push(Sep(At)),
+ '$' => res.push(Sep(Sha)),
+ '~' => res.push(Sep(Tilde)),
+ '&' => res.push(Sep(And)),
+ '|' => res.push(Sep(Or)),
+ '!' => res.push(Sep(Exclamation)),
+ '?' => res.push(Sep(Question)),
+ '^' => res.push(Sep(Caret)),
+ '\\' => res.push(Sep(Backslash)),
+ _ => return Err(UnknownPunctuation.into())
+ }
+ buf.clear();
+ }
+ Ok(TokenStream(res))
+}
+
+impl std::fmt::Display for TokenStream {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use Token::*;
+ let mut prev_token = Indent(0);
+ for token in &self.0 {
+ match (&prev_token, &token) {
+ (Word(_), Word(_)) | (Word(_), Num(_)) |
+ (Num(_), Word(_)) | (Num(_), Num(_)) => write!(f, " {}", token)?,
+ _ => write!(f, "{}", token)?,
+ }
+ prev_token = token.clone();
+ }
+ Ok(())
+ }
+}
+
+impl std::fmt::Display for Token {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use Token::*;
+ match self {
+ Key(word) => write!(f, "{}", word),
+ Word(val) => write!(f, "{}", val),
+ Num(val) => write!(f, "{}", val),
+ Lit(lit) => write!(f, "{}", lit),
+ Sep(sep) => write!(f, "{}", sep),
+ Indent(i) => write!(f, "\n{}", " ".repeat(*i)),
+ }
+ }
+}
+
+impl std::fmt::Display for Literal {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use Literal::*;
+ match self {
+ Char(val) => write!(f, "'{}'", val),
+ SingleLineString(val) => write!(f, "\"{}\"", val),
+ MultiLineString(val) => write!(f, "\"\"\"{}\"\"\"", val),
+ Comment(val) => write!(f, "#{}", val),
+ DocComment(val) => write!(f, "##{}", val),
+ MultiLineComment(val) => write!(f, "#[{}]#", val),
+ }
+ }
+}
+
+impl std::fmt::Display for Keyword {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use Keyword::*;
+ match self {
+ Pub => write!(f, "pub"),
+ Let => write!(f, "let"),
+ Var => write!(f, "var"),
+ Const => write!(f, "const"),
+ Func => write!(f, "func"),
+ Macro => write!(f, "macro"),
+ Type => write!(f, "type"),
+ Mod => write!(f, "mod"),
+ From => write!(f, "from"),
+ Import => write!(f, "import"),
+ For => write!(f, "for"),
+ While => write!(f, "while"),
+ Loop => write!(f, "loop"),
+ Block => write!(f, "block"),
+ Static => write!(f, "static"),
+ If => write!(f, "if"),
+ When => write!(f, "when"),
+ Elif => write!(f, "elif"),
+ Else => write!(f, "else"),
+ Match => write!(f, "match"),
+ Try => write!(f, "try"),
+ Catch => write!(f, "catch"),
+ Finally => write!(f, "finally"),
+ Struct => write!(f, "struct"),
+ Tuple => write!(f, "tuple"),
+ Enum => write!(f, "enum"),
+ Union => write!(f, "union"),
+ Interface => write!(f, "interface"),
+ Distinct => write!(f, "distinct"),
+ Ref => write!(f, "ref"),
+ Break => write!(f, "break"),
+ Continue => write!(f, "continue"),
+ Return => write!(f, "return"),
+ In => write!(f, "in"),
+ Is => write!(f, "is"),
+ Of => write!(f, "of"),
+ As => write!(f, "as"),
+ }
+ }
+}
+impl std::fmt::Display for Punctuation {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use Punctuation::*;
+ match self {
+ Comma => write!(f, ","),
+ Period => write!(f, "."),
+ Semicolon => write!(f, ";"),
+ Colon => write!(f, ":"),
+ BackTick => write!(f, "`"),
+ SingleQuote => write!(f, "'"),
+ DoubleQuote => write!(f, "\""),
+ FuncLeftParen => write!(f, "("),
+ FuncRightParen => write!(f, ")"),
+ TupleLeftParen => write!(f, " ("),
+ TupleRightParen => write!(f, ")"),
+ GenericLeftBracket => write!(f, "["),
+ GenericRightBracket => write!(f, "]"),
+ ArrayLeftBracket => write!(f, " ["),
+ ArrayRightBracket => write!(f, "]"),
+ StructLeftBrace => write!(f, "{{"),
+ StructRightBrace => write!(f, "}}"),
+ Equals => write!(f, "="),
+ Plus => write!(f, "+"),
+ Minus => write!(f, "- "),
+ Negative => write!(f, "-"),
+ Times => write!(f, "*"),
+ Slash => write!(f, "/"),
+ LessThan => write!(f, "<"),
+ GreaterThan => write!(f, ">"),
+ At => write!(f, "@"),
+ Sha => write!(f, "$"),
+ Tilde => write!(f, "~"),
+ And => write!(f, "&"),
+ Percent => write!(f, "%"),
+ Or => write!(f, "|"),
+ Exclamation => write!(f, "!"),
+ Question => write!(f, "?"),
+ Caret => write!(f, "^"),
+ Backslash => write!(f, "\\"),
+ }
+ }
+}