From 51c8b349a77a8d8b1b34ce8e03518dad6e3cba00 Mon Sep 17 00:00:00 2001 From: JJ Date: Sat, 6 Apr 2024 16:48:19 -0700 Subject: compiler: rewrite lexer, clean up ast --- src/frontend/ast.rs | 65 +++-- src/frontend/lex.rs | 682 +++++++++++++++++++++++++++----------------------- src/frontend/parse.rs | 218 +++++++++------- 3 files changed, 544 insertions(+), 421 deletions(-) diff --git a/src/frontend/ast.rs b/src/frontend/ast.rs index ed44712..b6ae7be 100644 --- a/src/frontend/ast.rs +++ b/src/frontend/ast.rs @@ -4,24 +4,25 @@ pub type Id = String; /// Puck's fundamental types. #[derive(Clone, PartialEq)] pub enum Type { - Void, Never, + Void, Never, Any, Integer, Float, String, // char et al are defined later - Func { // todo: multiple params, effects - from: Box, - to: Box + Func { // future: effects + from: Vec, + to: Vec }, - Struct(Vec<(Id, Box)>), - Tuple(Vec<(Option, Box)>), - Union(Vec<(Id, Box)>), + Struct(Vec<(Id, Type)>), + Tuple(Vec<(Option, Type)>), + Union(Vec<(Id, Type)>), Interface(Vec), - Array{size: usize, kind: Box}, + Array { + size: usize, + kind: Box + }, List(Box), Slice(Box), // todo: plus ownership - Reference(Box), - Pointer(Box), + Reference(Box), Pointer(Box), Distinct(Box), // todo: not sure - Mutable(Box), // parameters only - Static(Box), // parameters only + Mutable(Box), Static(Box), // parameters only Alias { // todo: this is wrong id: Id, generics: Vec @@ -31,11 +32,10 @@ pub enum Type { /// Function signatures. #[derive(Clone, PartialEq)] pub struct Sig { - pub effect: Option, pub id: Id, pub generics: Vec<(Id, Option)>, pub parameters: Vec, - pub kind: Option + pub kind: Type } /// Patterns are recognizable given zero context. @@ -50,8 +50,8 @@ pub enum Pattern { List(Vec), // arrays, slices, lists } -/// Expressions introduce a new binding or bindings, in some regard. -pub enum Binding { +/// Expressions introduce a new binding or bindings. +pub enum Binding { // todo: excessive use of Option Let { id: Pattern, // id: Pattern supports ex. `let (a, b) = ...` kind: Option, @@ -70,23 +70,22 @@ pub enum Binding { }, Func { public: bool, - effect: Option, id: Id, - generics: Vec<(Id, Option)>, // id, kind - parameters: Vec<(Id, Type)>, // id, kind + generics: Vec<(Id, Option)>, + parameters: Vec<(Id, Type)>, kind: Type, body: Vec }, Macro { public: bool, id: Id, - generics: Vec<(Id, Option)>, // id, kind - parameters: Vec<(Id, Option)>, // id, kind + generics: Vec<(Id, Option)>, + parameters: Vec<(Id, Option)>, kind: Option, body: Vec }, TypeDecl { id: Id, generics: Vec<(Id, Option)>, alias: Type }, - Import { from: Option, imports: Vec, alias: Option }, + Use { modules: Vec }, // todo: aliases? anything else fancy? Module { id: Id, body: Vec }, } @@ -95,12 +94,12 @@ pub enum Control { Call { id: Id, params: Vec }, // function calls, macro invocations, field access... If { branches: Vec, - else_body: Option> + else_body: Vec }, Try { body: Vec, catches: Vec, - finally: Option> + finally: Vec }, Match { item: Pattern, @@ -113,9 +112,19 @@ pub enum Control { Loop { body: Vec }, } -pub struct CondBranch { pub cond: Expr, pub body: Vec } -pub struct CatchBranch { pub exceptions: Vec<(Id, Option)>, pub body: Vec } -pub struct MatchBranch { pub patterns: Vec, pub guard: Option, pub body: Vec } +pub struct CondBranch { + pub cond: Expr, + pub body: Vec +} +pub struct CatchBranch { + pub exceptions: Vec<(Id, Option)>, + pub body: Vec +} +pub struct MatchBranch { + pub patterns: Vec, + pub guard: Option, + pub body: Vec +} /// Expressions are either Patterns, Bindings, or Control flow constructs. pub enum Expr { @@ -123,3 +132,5 @@ pub enum Expr { Binding(Binding), Control(Control), } + +pub type Ast = Vec; diff --git a/src/frontend/lex.rs b/src/frontend/lex.rs index e0aa76d..287ed9c 100644 --- a/src/frontend/lex.rs +++ b/src/frontend/lex.rs @@ -2,25 +2,41 @@ use multipeek::multipeek; use crate::*; -pub struct TokenStream(Vec); +/// We lex input into a TokenStream containing the position of the token and the Token itself. +pub struct TokenStream(Vec<(usize, Token)>); +/// Iterating over a TokenStream pops both the Tokens and the spans. +/// The parser only *really* cares about Tokens... spans are for use in error reporting. impl IntoIterator for TokenStream { - type Item = Token; - type IntoIter = std::vec::IntoIter; + type Item = (usize, Token); + type IntoIter = std::vec::IntoIter<(usize, Token)>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } +/// Basic syntax tokens. Form an unambiguous TokenStream. +#[derive(Clone, PartialEq)] +pub enum Token { + Key(Keyword), // keyword identifiers. ex. for, use, func... + Lit(Literal), // literal values. ex. for strings/comments. + Sep(Punctuation), // punctuation. non-word tokens. + Op(String), // operators. lexographically separate so the parser can parse infix notation + Num(String), // numeric values. ex. 413, 0b101011, 0xabcd, 612.1024 + Ident(String), // identifiers. variable names, function names, method calls... + Indent(usize), // indentation. denotes line breaks and scope at which a line starts + End // explicit scope end marker. inserted in normalization. +} + #[derive(Clone, PartialEq, Debug)] pub enum LexicalError { - InvalidIndentation, - MismatchedParens, - MismatchedBrackets, - UnknownPunctuation, + InvalidIndentation(usize), + MismatchedParens(usize), + MismatchedBrackets(usize), + UnknownPunctuation(usize), + EarlyEndOfInput, } - impl std::fmt::Display for LexicalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -28,18 +44,7 @@ impl std::fmt::Display for LexicalError { } impl std::error::Error for LexicalError {} -/// **Basic** syntax tokens. Form an unambiguous TokenStream. -#[derive(Clone, PartialEq)] -pub enum Token { - Key(Keyword), // keyword identifiers. - Word(String), // non-keyword identifiers. - Op(String), // operators. separate tokens lexographically so the parser knows to parse infix notation. - Num(String), // numeric value, ex. 413, 0b101011, 0xabcd - Lit(Literal), // literal value, ex. for strings/comments. - Sep(Punctuation), // punctuation. non-word tokens. ~~operators are lexed as this and later transformed to words.~~ - Indent(usize), // indentation. denotes line breaks and scope at which a line starts. -} - +/// Literal text. This can be chars, strings, and comments. #[derive(Clone, PartialEq)] pub enum Literal { Char(String), @@ -51,7 +56,7 @@ pub enum Literal { } /// Keywords, made explicit for easier use with Rust. -/// (strings inside match patterns are fucky!!) +/// (static strings inside match patterns are fucky!!) #[derive(Clone, PartialEq)] pub enum Keyword { Pub, Let, Var, Const, @@ -86,8 +91,8 @@ pub enum Punctuation { GenericRightBracket, // ] ArrayLeftBracket, // [ ArrayRightBracket, // ] - ScopeLeftBrace, // { - ScopeRightBrace, // } + DependentLeftBrace, // { (not currently used) + DependentRightBrace, // } StructLeftBrace, // { StructRightBrace, // } Equals, // = @@ -110,320 +115,371 @@ pub enum Punctuation { Backslash, // \ } -/// Parses whitespace-sensitive code into an unambiguous TokenStream. -/// Also useful for formatting. -pub fn tokenize(input: &str) -> Result { - // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. - // Tokens are matched by looping within their case until complete. - // This then eliminates the need for most global parser state. (i hate state) +struct Input<'a>(multipeek::MultiPeek>); - use Token::*; - use Literal::*; - use Punctuation::*; - use LexicalError::*; - enum Paren { Func, Tuple } - enum Bracket { Generic, Array } - struct State { - start_of_line: bool, - paren_stack: Vec, - bracket_stack: Vec, +impl<'a> Input<'a> { + /// Map input.next() to return Results for use with the propagation operator + fn next(&mut self) -> Result { + self.0.next().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn peek_eat(&mut self, expected: char) -> bool { + if self.peek().is_ok_and(|x| x == expected) { + self.0.next(); + true + } else { + false + } } + fn peek(&mut self) -> Result { + self.0.peek().copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn peek_nth(&mut self, n: usize) -> Result { + self.0.peek_nth(n).copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn consume_indentation(&mut self) -> usize { + let mut count = 0; + while self.peek_eat(' ') { + count += 1; + } + if self.peek_eat('\n') { + self.consume_indentation() + } else { + count + } + } + + fn strip(&mut self) { + loop { + if !self.peek_eat(' ') { + break + } + } + } +} + +use Token::*; +use Literal::*; +use Punctuation::*; +use LexicalError::*; +enum Paren { Func, Tuple } +enum Bracket { Generic, Array } +struct State { + start_of_line: bool, + paren_stack: Vec, + bracket_stack: Vec, + next_token_is_parameter_punctuation: bool, +} + +/// Parses whitespace-sensitive code into an unambiguous TokenStream. +pub fn lex(input: &str) -> Result { + let mut input = Input(multipeek(input.char_indices())); + let mut res = TokenStream(Vec::new()); let mut state = State { start_of_line: true, paren_stack: vec!(), bracket_stack: vec!(), + next_token_is_parameter_punctuation: false, }; + res.0.push((0, Indent(input.consume_indentation()))); + loop { + input.strip(); + match input.0.next() { + Some((pos, c)) => res.0.push((pos, lex_token(&mut input, &mut state, pos, c)?)), + None => break + } + } + Ok(res) +} - let mut buf = String::new(); - let mut res = Vec::new(); - - // `char` in rust is four bytes it's fine - let mut input = multipeek(input.chars()); - while let Some(c) = input.next() { - match c { - ' ' => { // indentation! and whitespace - match res.last() { - Some(Indent(_)) => { // indentation! - res.pop(); // discard previous empty or useless Indent token - let mut current_indent_level = 1; - while let Some(x) = input.peek() { - match x { - ' ' => { current_indent_level += 1; input.next(); }, - _ => match res.last() { // indentation ends - Some(Word(a)) if a == "==" || a == "and" || a == "or" || - a == "xor" || a == "in" || a == "is" => break, - Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) | - Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) | - Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break, - _ => { - res.push(Indent(current_indent_level)); - break; - } - } - } - } - }, - _ => { // get rid of excess (all) whitespace between words/operators - while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); } +fn lex_token(input: &mut Input, state: &mut State, pos: usize, c: char) -> Result { + Ok(match c { + '\n' => Indent(input.consume_indentation()), + '\t' => return Err(InvalidIndentation(pos).into()), + ' ' => unreachable!(), + ':' => Sep(Colon), + '=' => Sep(Equals), + '\'' => lex_char(input)?, + '"' => lex_string(input)?, + '#' => lex_comment(input)?, + c if c.is_alphabetic() || c == '_' => lex_identifier(input, state, c), // valid identifiers! + c if c.is_digit(10) => lex_number(input, c), // numeric literals! + '-' => { + // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. + match input.peek()? { + ' ' => Sep(Minus), + _ => Sep(Negative) + } + }, + '(' => { + if state.next_token_is_parameter_punctuation { + state.next_token_is_parameter_punctuation = false; + state.paren_stack.push(Paren::Func); + Sep(FuncLeftParen) + } else { + state.paren_stack.push(Paren::Tuple); + Sep(TupleLeftParen) + } + }, + '[' => { + if state.next_token_is_parameter_punctuation { + state.next_token_is_parameter_punctuation = false; + state.bracket_stack.push(Bracket::Generic); + Sep(GenericLeftBracket) + } else { + state.bracket_stack.push(Bracket::Array); + Sep(ArrayLeftBracket) + } + }, + ')' => { + match state.paren_stack.pop() { + Some(Paren::Func) => Sep(FuncRightParen), + Some(Paren::Tuple) => Sep(TupleRightParen), + None => return Err(MismatchedParens(pos).into()), + } + }, + ']' => { + match state.bracket_stack.pop() { + Some(Bracket::Generic) => { + if let Ok('(') = input.peek() { + state.next_token_is_parameter_punctuation = true; } + Sep(GenericRightBracket) + }, + Some(Bracket::Array) => Sep(ArrayRightBracket), + None => return Err(MismatchedBrackets(pos).into()), + } + }, + '`' => { // backticks are used for injections, so generics/parameters may follow + // todo: SERIOUS ambiguity issue with `injection`[] + match input.peek() { + Ok('(') | Ok('[') => { + state.next_token_is_parameter_punctuation = true; + }, + _ => () + } + Sep(BackTick) + }, + ',' => Sep(Comma), + '.' => Sep(Period), + ';' => Sep(Semicolon), + '{' => Sep(StructLeftBrace), + '}' => Sep(StructRightBrace), + '+' => Sep(Plus), + '*' => Sep(Times), + '/' => Sep(Slash), + '<' => Sep(LessThan), + '>' => Sep(GreaterThan), + '@' => Sep(At), + '$' => Sep(Sha), + '~' => Sep(Tilde), + '&' => Sep(And), + '|' => Sep(Or), + '!' => Sep(Exclamation), + '?' => Sep(Question), + '^' => Sep(Caret), + '\\' => Sep(Backslash), + _ => return Err(UnknownPunctuation(pos).into()) + }) +} + +/// Char ::= '\'' '\'' +fn lex_char(input: &mut Input) -> Result { + let mut buf = String::new(); + loop { + match input.next()? { + '\'' => break, + '\\' => { + if let y = input.next()? { + buf.push('\\'); + buf.push(y); } }, - '\t' => return Err(InvalidIndentation.into()), - '\n' => res.push(Indent(0)), - '\'' => { // chars! - while let Some(x) = input.next() { - match x { - '\'' => break, - '\\' => if let Some(y) = input.next() { buf.push(y) }, - _ => buf.push(x) - } + x => buf.push(x) + } + } + Ok(Lit(Char(buf))) +} + +/// String ::= '"' '"' | '"""' '"""' +fn lex_string(input: &mut Input) -> Result { + let mut buf = String::new(); + match (input.peek_nth(0), input.peek_nth(1)) { + (Ok('"'), Ok('"')) => { // triple quoted strings + input.next()?; input.next()?; + loop { + match input.next()? { + '"' if input.peek_nth(0)? == '"' && + input.peek_nth(1)? == '"' => { + input.next()?; input.next()?; + break; + }, + x => buf.push(x) } - res.push(Lit(Char(String::from(&buf)))); - }, - '"' => { // strings! - match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) { - (Some('"'), Some('"')) => { // triple quoted strings - input.next(); input.next(); - while let Some(x) = input.next() { - match x { - '"' if input.peek_nth(0) == Some(&'"') && - input.peek_nth(1) == Some(&'"') => { - input.next(); input.next(); - break; - }, - _ => buf.push(x) - } - } - res.push(Lit(MultiLineString(String::from(&buf)))); + } + Ok(Lit(MultiLineString(buf))) + }, + _ => { // single quoted strings + loop { + match input.next()? { + '"' => break, + '\\' => if let y = input.next()? { + buf.push(y) // FIXME }, - (_, _) => { // single quoted strings - while let Some(x) = input.next() { - match x { - '"' => break, - '\\' => if let Some(y) = input.next() { buf.push(y) }, - _ => buf.push(x) - } - } - res.push(Lit(SingleLineString(String::from(&buf)))); - } + x => buf.push(x) } - }, - '#' => { // comments! - match input.peek() { - Some('[') => { // block comment, can be nested - input.next(); - let mut comment_level = 1; - while let Some(x) = input.next() && comment_level > 0 { - match x { - '#' if input.peek() == Some(&'[') => { - comment_level += 1; - input.next(); - }, - ']' if input.peek() == Some(&'#') => { - comment_level -= 1; - input.next(); - }, - _ => buf.push(x) - } - } - res.push(Lit(MultiLineComment(String::from(&buf)))); + } + Ok(Lit(SingleLineString(buf))) + } + } +} + +/// Comment ::= '#' '\n' | '#[' Comment | ']#' +fn lex_comment(input: &mut Input) -> Result { + let mut buf = String::new(); + match input.peek() { + Ok('[') => { // block comment, can be nested + input.next()?; + let mut comment_level = 1; + while comment_level > 0 { + match input.next()? { + '#' if input.peek_eat('[') => { + comment_level += 1; + buf.push_str("#["); }, - Some(&'#') => { // documentation comment - input.next(); - while let Some(x) = input.peek() { - match x { - '\n' => break, - _ => { - buf.push(*x); - } - } - input.next(); - } - res.push(Lit(DocComment(String::from(&buf)))); + ']' if input.peek_eat('#') => { + comment_level -= 1; + buf.push_str("]#"); }, - _ => { // standard comment, runs til EOL - while let Some(x) = input.peek() { - match x { - '\n' => break, - _ => { - buf.push(*x); - } - } - input.next(); - } - res.push(Lit(Comment(String::from(&buf)))); - } - } - }, - c if c.is_alphabetic() || c == '_' => { // valid identifiers! - buf.push(c); - while let Some(x) = input.peek() { - match x { - x if x.is_alphanumeric() || x == &'_' => { - buf.push(*x); - input.next(); - }, - _ => { - use Keyword::*; - match buf.as_str() { // keywords! - "pub" => res.push(Key(Pub)), - "let" => res.push(Key(Let)), - "var" => res.push(Key(Var)), - "const" => res.push(Key(Const)), - "func" => res.push(Key(Func)), - "macro" => res.push(Key(Macro)), - "type" => res.push(Key(Type)), - "mod" => res.push(Key(Mod)), - "use" => res.push(Key(Use)), - "as" => res.push(Key(As)), - "for" => res.push(Key(For)), - "while" => res.push(Key(While)), - "loop" => res.push(Key(Loop)), - "block" => res.push(Key(Block)), - "static" => res.push(Key(Static)), - "if" => res.push(Key(If)), - "when" => res.push(Key(When)), - "elif" => res.push(Key(Elif)), - "else" => res.push(Key(Else)), - "match" => res.push(Key(Match)), - "where" => res.push(Key(Where)), - "try" => res.push(Key(Try)), - "catch" => res.push(Key(Catch)), - "finally" => res.push(Key(Finally)), - "struct" => res.push(Key(Struct)), - "tuple" => res.push(Key(Tuple)), - "enum" => res.push(Key(Enum)), - "union" => res.push(Key(Union)), - "interface" => res.push(Key(Interface)), - "distinct" => res.push(Key(Distinct)), - "ref" => res.push(Key(Ref)), - "ptr" => res.push(Key(Ptr)), - "mut" => res.push(Key(Mut)), - "break" => res.push(Key(Break)), - "continue" => res.push(Key(Continue)), - "return" => res.push(Key(Return)), - "in" => res.push(Key(In)), - "is" => res.push(Key(Is)), - "of" => res.push(Key(Of)), - _ => res.push(Word(String::from(&buf))) - } - match x { // () and [] denote both parameters/generics and tuples/arrays - '(' => { // we must disambiguate by treating those *directly* after words as such - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - }, - '[' => { - res.push(Sep(GenericLeftBracket)); - state.bracket_stack.push(Bracket::Generic); - input.next(); - }, - _ => {}, - } - break; - } + x => buf.push(x) + }; + } + Ok(Lit(MultiLineComment(buf))) + }, + Ok('#') => { // documentation comment + input.next()?; + while let Ok(x) = input.peek() { + match x { + '\n' => break, + _ => { + buf.push(x); + input.next()?; } } - }, - '0'..='9' => { // numeric literals! - buf.push(c); - while let Some(x) = input.peek() { - match x { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { - buf.push(*x); - input.next(); - }, - _ => break + } + Ok(Lit(DocComment(buf))) + }, + _ => { // standard comment, runs til EOL + while let Ok(x) = input.peek() { + match x { + '\n' => break, + _ => { + buf.push(x); + input.next()?; } } - res.push(Num(String::from(&buf))) - }, - '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. - match input.peek() { - Some(' ') => res.push(Sep(Minus)), - _ => res.push(Sep(Negative)) - } - }, - '(' => { // note: FuncParens were matched above, directly after identifiers - res.push(Sep(TupleLeftParen)); - state.paren_stack.push(Paren::Tuple); - }, - '[' => { // note: GenericBrackets were matched above, directly after identifiers - res.push(Sep(ArrayLeftBracket)); - state.bracket_stack.push(Bracket::Array); - }, - ')' => { // match parens - match state.paren_stack.pop() { - Some(Paren::Func) => res.push(Sep(FuncRightParen)), - Some(Paren::Tuple) => res.push(Sep(TupleRightParen)), - None => return Err(MismatchedParens.into()), - } - }, - ']' => { // match brackets - match state.bracket_stack.pop() { - Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)), - Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)), - None => return Err(MismatchedBrackets.into()), - } - if input.peek() == Some(&'(') { // parameters following generics - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - } + } + Ok(Lit(Comment(buf))) + } + } +} + +#[allow(unused_assignments)] +fn lex_identifier(input: &mut Input, state: &mut State, c: char) -> Token { + use Keyword::*; + let mut buf = String::new(); + let mut keyword = None; + buf.push(c); + loop { + match input.peek() { + Ok(x) if x.is_alphanumeric() || x == '_' => { + buf.push(x); + let _ = input.next(); }, - '`' => { // backticks are used for operators, so generics/parameters may follow - res.push(Sep(BackTick)); // todo: backticks could like not be used for operators - match input.peek() { - Some('(') => { - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - }, - Some('[') => { - res.push(Sep(GenericLeftBracket)); - state.bracket_stack.push(Bracket::Generic); - input.next(); - }, - _ => {} - } + _ => { + keyword = Some(match buf.as_str() { // keywords! + "pub" => Key(Pub), + "let" => Key(Let), + "var" => Key(Var), + "const" => Key(Const), + "func" => Key(Func), + "macro" => Key(Macro), + "type" => Key(Type), + "mod" => Key(Mod), + "use" => Key(Use), + "as" => Key(As), + "for" => Key(For), + "while" => Key(While), + "loop" => Key(Loop), + "block" => Key(Block), + "static" => Key(Static), + "if" => Key(If), + "when" => Key(When), + "elif" => Key(Elif), + "else" => Key(Else), + "match" => Key(Match), + "where" => Key(Where), + "try" => Key(Try), + "catch" => Key(Catch), + "finally" => Key(Finally), + "struct" => Key(Struct), + "tuple" => Key(Tuple), + "enum" => Key(Enum), + "union" => Key(Union), + "interface" => Key(Interface), + "distinct" => Key(Distinct), + "ref" => Key(Ref), + "ptr" => Key(Ptr), + "mut" => Key(Mut), + "in" => Key(In), + "is" => Key(Is), + "of" => Key(Of), + "break" => Key(Break), + "continue" => Key(Continue), + "return" => Key(Return), + _ => Ident(String::from(&buf)) + }); + break + } + } + } + // () and [] denote both parameters/generics and tuples/arrays + // we must disambiguate by treating those *directly* after words as such + match input.peek() { + Ok('(') | Ok('[') => state.next_token_is_parameter_punctuation = true, + _ => (), + } + keyword.unwrap() +} + +fn lex_number(input: &mut Input, c: char) -> Token { + let mut buf = String::new(); + buf.push(c); + loop { + match input.peek() { + Ok(x) if x.is_alphanumeric() => { // FIXME + buf.push(x); + let _ = input.next(); }, - ',' => res.push(Sep(Comma)), - '.' => res.push(Sep(Period)), - ';' => res.push(Sep(Semicolon)), - ':' => res.push(Sep(Colon)), - '{' => res.push(Sep(StructLeftBrace)), - '}' => res.push(Sep(StructRightBrace)), - '=' => res.push(Sep(Equals)), - '+' => res.push(Sep(Plus)), - '*' => res.push(Sep(Times)), - '/' => res.push(Sep(Slash)), - '<' => res.push(Sep(LessThan)), - '>' => res.push(Sep(GreaterThan)), - '@' => res.push(Sep(At)), - '$' => res.push(Sep(Sha)), - '~' => res.push(Sep(Tilde)), - '&' => res.push(Sep(And)), - '|' => res.push(Sep(Or)), - '!' => res.push(Sep(Exclamation)), - '?' => res.push(Sep(Question)), - '^' => res.push(Sep(Caret)), - '\\' => res.push(Sep(Backslash)), - _ => return Err(UnknownPunctuation.into()) + _ => break } - buf.clear(); } - Ok(TokenStream(res)) + Num(buf) } impl std::fmt::Display for TokenStream { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; let mut prev_token = Indent(0); - for token in &self.0 { - match (&prev_token, &token) { - (Word(_), Word(_)) | (Word(_), Num(_)) | - (Num(_), Word(_)) | (Num(_), Num(_)) => write!(f, " {}", token)?, + for (_, token) in &self.0 { + match (prev_token, token) { // fixme + (Ident(_), Ident(_)) | (Ident(_), Num(_)) | + (Num(_), Ident(_)) | (Num(_), Num(_)) | + (Key(_), Key(_)) | (Key(_), Ident(_)) | (Ident(_), Key(_)) + => write!(f, " {}", token)?, _ => write!(f, "{}", token)?, } prev_token = token.clone(); @@ -437,11 +493,11 @@ impl std::fmt::Display for Token { use Token::*; match self { Key(word) => write!(f, "{}", word), - Word(val) => write!(f, "{}", val), - Num(val) => write!(f, "{}", val), + Ident(val) | Op(val) | Num(val) => write!(f, "{}", val), Lit(lit) => write!(f, "{}", lit), Sep(sep) => write!(f, "{}", sep), Indent(i) => write!(f, "\n{}", " ".repeat(*i)), + End => write!(f, "end"), } } } @@ -526,8 +582,8 @@ impl std::fmt::Display for Punctuation { GenericRightBracket => write!(f, "]"), ArrayLeftBracket => write!(f, " ["), ArrayRightBracket => write!(f, "]"), - ScopeLeftBrace => write!(f, "{{"), - ScopeRightBrace => write!(f, "}}"), + DependentLeftBrace => write!(f, "{{"), + DependentRightBrace => write!(f, "}}"), StructLeftBrace => write!(f, "{{"), StructRightBrace => write!(f, "}}"), Equals => write!(f, "="), @@ -551,3 +607,9 @@ impl std::fmt::Display for Punctuation { } } } + +pub fn test() { + let input = std::fs::read_to_string("std/ast.pk").unwrap(); + let lexed = lex(&input).unwrap(); + println!("{}", lexed); +} diff --git a/src/frontend/parse.rs b/src/frontend/parse.rs index 8616346..352599b 100644 --- a/src/frontend/parse.rs +++ b/src/frontend/parse.rs @@ -9,17 +9,17 @@ use Token::*; use Literal::*; use Punctuation::*; -struct Input(multipeek::MultiPeek>); +struct Input(multipeek::MultiPeek>); impl Input { /// Map input.next() to return Results for use with the propagation operator fn next(&mut self) -> Result { - self.0.next().ok_or("end of input".into()) + self.0.next().map(|x| x.1).ok_or("end of input".into()) // FIXME } /// Check if the next character is an expected token, and if so, advance the iterator and return true - fn peek(&mut self, expected: Token) -> bool { - if self.0.peek() == Some(&expected) { + fn peek_eat(&mut self, expected: Token) -> bool { + if self.peek().is_some_and(|x| x == expected) { self.0.next(); true } else { @@ -28,13 +28,13 @@ impl Input { } /// Expose input.peek() (we don't want EOF to trigger an error when peeking) - fn peek_opt(&mut self) -> Option<&Token> { - self.0.peek() + fn peek(&mut self) -> Option { + self.0.peek().map(|x| x.1.clone()) } /// Expose input.peek_nth() - fn peek_nth(&mut self, n: usize) -> Option<&Token> { - self.0.peek_nth(n) + fn peek_nth(&mut self, n: usize) -> Option { + self.0.peek_nth(n).map(|x| x.1.clone()) } /// Asserts the next character to be a known token @@ -51,9 +51,13 @@ impl Input { } } +fn normalize(input: TokenStream) -> TokenStream { + todo!() +} + /// Convert a basic TokenStream into an AbstractSyntaxTree -pub fn astify(input: TokenStream, name: &str) -> Result { - let mut input = Input(multipeek(input)); +pub fn parse(input: TokenStream, name: &str) -> Result { + let mut input = Input(multipeek(normalize(input))); let body = parse_body(&mut input)?; if input.len() > 0 { return Err(format!("additional tokens remaining after the body!").into()); @@ -62,18 +66,18 @@ pub fn astify(input: TokenStream, name: &str) -> Result { } /// Parse a series of Exprs, for ex. the body of a function. -/// Body ::= Expr | ('{' Expr (';' Expr)* '}') -fn parse_body(input: &mut Input) -> Result> { +/// Body ::= Expr (';' Expr)* +fn parse_body(input: &mut Input) -> Result { let mut res = Vec::new(); - if !input.peek(Sep(ScopeLeftBrace)) { + if !input.peek_eat(Sep(Colon)) && !input.peek_eat(Sep(Equals)) { res.push(parse_expr(input)?); return Ok(res); } - while !input.peek(Sep(ScopeRightBrace)) { + while !input.peek_eat(End) { res.push(parse_expr(input)?); // consume semicolons. there doesn't *have* to be a semicolon though. // this should probably be checked to be a semicolon or a right brace. - input.peek(Sep(Semicolon)); + input.peek_eat(Sep(Semicolon)); } Ok(res) } @@ -136,7 +140,7 @@ fn parse_var(input: &mut Input) -> Result { let id = parse_pattern(input)?; let kind = parse_annotation(input)?; let mut value = None; - if input.peek(Sep(Equals)) { + if input.peek_eat(Sep(Equals)) { value = Some(Box::new(parse_expr(input)?)); } Ok(Expr::Binding(Var { id, kind, value })) @@ -153,7 +157,7 @@ fn parse_const(input: &mut Input, public: bool) -> Result { /// Annotation ::= (':' Type)? fn parse_annotation(input: &mut Input) -> Result> { - if input.peek(Sep(Colon)) { + if input.peek_eat(Sep(Colon)) { Ok(Some(parse_type(input)?)) } else { Ok(None) @@ -162,15 +166,14 @@ fn parse_annotation(input: &mut Input) -> Result> { /// `Func ::= 'pub'? 'func' Ident ('[' Parameters ']')? ('(' Parameters ')')? Annotation '=' Body` fn parse_func(input: &mut Input, public: bool) -> Result { - let effect = None; let id = parse_ident(input)?; let mut generics = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { generics = parse_parameters(input)?; input.then(Sep(GenericRightBracket))?; } let mut parameters = Vec::new(); - if input.peek(Sep(FuncLeftParen)) { // todo: rewrite to map over an input + if input.peek_eat(Sep(FuncLeftParen)) { // todo: rewrite to map over an input // let temp_parameters = parse_parameters(input)?; // if temp_parameters.last().is_none() { // return Err("expected a type annotation on the last function parameter".into()); @@ -183,7 +186,7 @@ fn parse_func(input: &mut Input, public: bool) -> Result { } else { stack.push(id); } - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { let (id, kind) = parse_parameter(input)?; stack.push(id); if kind.is_some() { @@ -199,29 +202,29 @@ fn parse_func(input: &mut Input, public: bool) -> Result { input.then(Sep(FuncRightParen))?; } let mut kind = Type::Void; - if input.peek(Sep(Colon)) { + if input.peek_eat(Sep(Colon)) { kind = parse_type(input)?; } input.then(Sep(Equals))?; let body = parse_body(input)?; - Ok(Expr::Binding(Func { public, effect, id, generics, parameters, kind, body })) + Ok(Expr::Binding(Func { public, id, generics, parameters, kind, body })) } /// `Macro ::= 'pub'? 'macro' Ident ('[' Parameters ']')? ('(' Parameters ')')? (':' Type) '=' Body` fn parse_macro(input: &mut Input, public: bool) -> Result { let id = parse_ident(input)?; let mut generics = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { generics = parse_parameters(input)?; input.then(Sep(GenericRightBracket))?; } let mut parameters = Vec::new(); - if input.peek(Sep(FuncLeftParen)) { + if input.peek_eat(Sep(FuncLeftParen)) { parameters = parse_parameters(input)?; input.then(Sep(FuncRightParen))?; } let mut kind = None; - if input.peek(Sep(Colon)) { + if input.peek_eat(Sep(Colon)) { kind = Some(parse_type(input)?); } input.then(Sep(Equals))?; @@ -233,7 +236,7 @@ fn parse_macro(input: &mut Input, public: bool) -> Result { fn parse_typedecl(input: &mut Input, public: bool) -> Result { let id = parse_ident(input)?; let mut generics = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { generics = parse_parameters(input)?; input.then(Sep(GenericRightBracket))?; } @@ -246,7 +249,7 @@ fn parse_typedecl(input: &mut Input, public: bool) -> Result { fn parse_parameter(input: &mut Input) -> Result<(Id, Option)> { let id = parse_ident(input)?; let mut kind = None; - if input.peek(Sep(Colon)) { + if input.peek_eat(Sep(Colon)) { kind = Some(parse_type(input)?); } Ok((id, kind)) @@ -256,7 +259,7 @@ fn parse_parameter(input: &mut Input) -> Result<(Id, Option)> { fn parse_parameters(input: &mut Input) -> Result)>> { let mut res = Vec::new(); res.push(parse_parameter(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_parameter(input)?); } Ok(res) @@ -281,7 +284,7 @@ fn parse_block(input: &mut Input) -> Result { let body = parse_body(input)?; Ok(Expr::Control(Block { id, body })) }, - Word(label) => { + Ident(label) => { input.then(Sep(Colon))?; let id = Some(label); let body = parse_body(input)?; @@ -327,12 +330,12 @@ fn parse_loop(input: &mut Input) -> Result { fn parse_if(input: &mut Input) -> Result { let mut branches = Vec::new(); branches.push(parse_cond_branch(input)?); - while input.peek(Key(Keyword::Elif)) { + while input.peek_eat(Key(Keyword::Elif)) { branches.push(parse_cond_branch(input)?); } - let mut else_body = None; - if input.peek(Key(Keyword::Else)) { - else_body = Some(parse_body(input)?); + let mut else_body = Vec::new(); + if input.peek_eat(Key(Keyword::Else)) { + else_body = parse_body(input)?; } Ok(Expr::Control(If { branches, else_body })) } @@ -341,13 +344,13 @@ fn parse_if(input: &mut Input) -> Result { fn parse_when(input: &mut Input) -> Result { let mut branches = Vec::new(); branches.push(parse_cond_branch(input)?); - while input.peek(Key(Keyword::Elif)) { + while input.peek_eat(Key(Keyword::Elif)) { branches.push(parse_cond_branch(input)?); } - let mut else_body = None; - if input.peek(Key(Keyword::Else)) { + let mut else_body = Vec::new(); + if input.peek_eat(Key(Keyword::Else)) { input.then(Sep(Colon))?; - else_body = Some(parse_body(input)?); + else_body = parse_body(input)?; } let mut body = Vec::new(); body.push(Expr::Control(If { branches, else_body })); @@ -367,20 +370,20 @@ fn parse_try(input: &mut Input) -> Result { input.then(Sep(Colon))?; let body = parse_body(input)?; let mut catches = Vec::new(); - while input.peek(Key(Keyword::Catch)) { + while input.peek_eat(Key(Keyword::Catch)) { let mut exceptions = Vec::new(); exceptions.push(parse_catch_exception(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { exceptions.push(parse_catch_exception(input)?); } input.then(Sep(Colon))?; let body = parse_body(input)?; catches.push(CatchBranch { exceptions, body }); } - let mut finally = None; - if input.peek(Key(Keyword::Finally)) { + let mut finally = Vec::new(); + if input.peek_eat(Key(Keyword::Finally)) { input.then(Sep(Colon))?; - finally = Some(parse_body(input)?); + finally = parse_body(input)?; } Ok(Expr::Control(Try { body, catches, finally })) } @@ -389,7 +392,7 @@ fn parse_try(input: &mut Input) -> Result { fn parse_catch_exception(input: &mut Input) -> Result<(Id, Option)> { let id = parse_ident(input)?; let mut alias = None; - if input.peek(Key(Keyword::As)) { + if input.peek_eat(Key(Keyword::As)) { alias = Some(parse_ident(input)?); } Ok((id, alias)) @@ -399,14 +402,14 @@ fn parse_catch_exception(input: &mut Input) -> Result<(Id, Option)> { fn parse_match(input: &mut Input) -> Result { let item = parse_pattern(input)?; // fixme let mut branches = Vec::new(); - while input.peek(Key(Keyword::Of)) { + while input.peek_eat(Key(Keyword::Of)) { let mut patterns = Vec::new(); patterns.push(parse_pattern(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { patterns.push(parse_pattern(input)?); } let mut guard = None; - if input.peek(Key(Keyword::Where)) { + if input.peek_eat(Key(Keyword::Where)) { guard = Some(parse_expr(input)?) } input.then(Sep(Colon))?; @@ -437,11 +440,11 @@ fn parse_type(input: &mut Input) -> Result { Keyword::Interface => parse_interface(input), _ => return Err("invalid keyword present in type!".into()) }, - Word(id) => { + Ident(id) => { let mut generics = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { generics.push(parse_type(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { generics.push(parse_type(input)?); } input.then(Sep(GenericRightBracket))?; @@ -455,9 +458,9 @@ fn parse_type(input: &mut Input) -> Result { /// `StructType ::= 'struct' ('[' Ident ':' Type (',' Ident ':' Type)* ']')?` fn parse_struct_type(input: &mut Input) -> Result { let mut res = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { res.push(parse_struct_field(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_struct_field(input)?); } input.then(Sep(GenericRightBracket))?; @@ -465,19 +468,19 @@ fn parse_struct_type(input: &mut Input) -> Result { Ok(Type::Struct(res)) } -fn parse_struct_field(input: &mut Input) -> Result<(Id, Box)> { +fn parse_struct_field(input: &mut Input) -> Result<(Id, Type)> { let id = parse_ident(input)?; input.then(Sep(Colon))?; - let kind = Box::new(parse_type(input)?); + let kind = parse_type(input)?; Ok((id, kind)) } /// `TupleType ::= 'tuple' ('[' (Ident ':')? Type (',' (Ident ':')? Type)* ']')?` fn parse_tuple_type(input: &mut Input) -> Result { let mut res = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { res.push(parse_tuple_field(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_tuple_field(input)?); } input.then(Sep(GenericRightBracket))?; @@ -486,23 +489,23 @@ fn parse_tuple_type(input: &mut Input) -> Result { } // annoyingly complex to parse. `TupleField ::= (Ident ':')? Type` -fn parse_tuple_field(input: &mut Input) -> Result<(Option, Box)> { - match input.peek_opt().clone() { - Some(Word(id)) if input.peek_nth(1) == Some(&Sep(Colon)) => { +fn parse_tuple_field(input: &mut Input) -> Result<(Option, Type)> { + match input.peek().clone() { + Some(Ident(id)) if input.peek_nth(1) == Some(Sep(Colon)) => { input.next()?; input.then(Sep(Colon))?; - Ok((Some(id.to_string()), Box::new(parse_type(input)?))) + Ok((Some(id.to_string()), parse_type(input)?)) }, - _ => Ok((None, Box::new(parse_type(input)?))) + _ => Ok((None, parse_type(input)?)) } } /// `EnumType ::= 'enum' ('[' Ident ('=' Pattern)? (Ident ('=' Pattern)?)* ']')?` fn parse_enum_type(input: &mut Input) -> Result { let mut res = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { res.push(parse_enum_variant(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_enum_variant(input)?); } input.then(Sep(GenericRightBracket))?; @@ -513,7 +516,7 @@ fn parse_enum_type(input: &mut Input) -> Result { fn parse_enum_variant(input: &mut Input) -> Result<(Id, Option)> { let id = parse_ident(input)?; let mut kind = None; - if input.peek(Sep(Equals)) { + if input.peek_eat(Sep(Equals)) { kind = Some(parse_pattern(input)?); } Ok((id, kind)) @@ -522,9 +525,9 @@ fn parse_enum_variant(input: &mut Input) -> Result<(Id, Option)> { /// `UnionType ::= 'union' ('[' Ident (':' Type)? (',' Ident (':' Type)?)* ']')?` fn parse_union_type(input: &mut Input) -> Result { let mut res = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { res.push(parse_union_variant(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_union_variant(input)?); } input.then(Sep(GenericRightBracket))?; @@ -532,11 +535,11 @@ fn parse_union_type(input: &mut Input) -> Result { Ok(Type::Union(res)) } -fn parse_union_variant(input: &mut Input) -> Result<(Id, Box)> { +fn parse_union_variant(input: &mut Input) -> Result<(Id, Type)> { let id = parse_ident(input)?; - let mut kind = Box::new(Type::Alias { id: "unit".to_string(), generics: Vec::new() }); - if input.peek(Sep(Colon)) { - kind = Box::new(parse_type(input)?); + let mut kind = Type::Alias { id: "unit".to_string(), generics: Vec::new() }; + if input.peek_eat(Sep(Colon)) { + kind = parse_type(input)?; } Ok((id, kind)) } @@ -544,9 +547,9 @@ fn parse_union_variant(input: &mut Input) -> Result<(Id, Box)> { /// `Interface ::= 'interface' ('[' Signature (',' Signature)* ']')?` fn parse_interface(input: &mut Input) -> Result { let mut res = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { res.push(parse_signature(input)?); - while input.peek(Sep(Comma)) { + while input.peek_eat(Sep(Comma)) { res.push(parse_signature(input)?); } input.then(Sep(GenericRightBracket))?; @@ -556,31 +559,30 @@ fn parse_interface(input: &mut Input) -> Result { /// `Signature ::= Ident ('[' Parameters ']')? ('(' Type (',' Type)* ')')? (':' Type)?` fn parse_signature(input: &mut Input) -> Result { - let effect = None; let id = parse_ident(input)?; let mut generics = Vec::new(); - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { generics = parse_parameters(input)?; input.then(Sep(GenericRightBracket))?; } let mut parameters = Vec::new(); - if input.peek(Sep(FuncLeftParen)) { + if input.peek_eat(Sep(FuncLeftParen)) { parameters.push(parse_type(input)?); - if input.peek(Sep(Comma)) { + if input.peek_eat(Sep(Comma)) { parameters.push(parse_type(input)?); } input.then(Sep(FuncRightParen))?; } - let mut kind = None; - if input.peek(Sep(Colon)) { - kind = Some(parse_type(input)?); + let mut kind = Type::Void; + if input.peek_eat(Sep(Colon)) { + kind = parse_type(input)?; } - Ok(Sig { effect, id, generics, parameters, kind }) + Ok(Sig { id, generics, parameters, kind }) } /// `WrappedType ::= Type | ('[' Type ']')` fn parse_wrapped_type(input: &mut Input) -> Result { - if input.peek(Sep(GenericLeftBracket)) { + if input.peek_eat(Sep(GenericLeftBracket)) { let result = parse_type(input)?; input.then(Sep(GenericRightBracket))?; Ok(result) @@ -589,12 +591,60 @@ fn parse_wrapped_type(input: &mut Input) -> Result { } } -/// Pattern ::= Literal | Ident | '(' Pattern (',' Pattern)* ')' | Ident '(' Pattern (',' Pattern)* ')' -fn parse_pattern(input: &mut Input) -> Result { todo!() } +/// Pattern ::= Char | String | Number | Float | Ident | +/// '(' Pattern (',' Pattern)* ')' | Ident '(' Pattern (',' Pattern)* ')' +fn parse_pattern(input: &mut Input) -> Result { + // use Pattern::*; + match input.next()? { + Key(_) => todo!(), + Ident(id) => { + if input.peek() == Some(Sep(FuncLeftParen)) { + Ok(Pattern::Struct(todo!())) + } else { + Ok(Pattern::Ident(id)) + } + }, + Sep(FuncLeftParen) => { // fixme + parse_pattern(input)?; + while input.peek() == Some(Sep(Comma)) { + parse_pattern(input)?; + } + input.then(Sep(FuncRightParen))?; + todo!() + }, + Num(value) => Ok(Pattern::Number(todo!())), + Lit(val) => match val { + Literal::Char(val) => Ok(Pattern::Char(val.parse::()?)), + SingleLineString(val) | MultiLineString(val) => Ok(Pattern::String(val)), + token => Err(format!("expected literal but found token {}", token).into()) + }, + Sep(_) => todo!(), + _ => todo!() + } +} + +// fine to not parse operators until here tbh +fn parse_operator(input: &mut Input) -> Result { + match input.next()? { + Key(word) => Ok(word.to_string()), + Sep(tok) => { + let mut buf = String::new(); + // buf.push(tok); + loop { + match input.peek() { + Some(Key(_)) => todo!(), + _ => break + } + } + Ok(buf) + }, + token => Err(format!("expected operator but found token {}", token).into()) + } +} fn parse_ident(input: &mut Input) -> Result { match input.next()? { - Word(id) => Ok(id), + Ident(id) => Ok(id), token => Err(format!("expected identifier but found token {}", token).into()) } } -- cgit v1.2.3-70-g09d2