From 87d74952a614daa7075aeecef462ff51c4dc46e0 Mon Sep 17 00:00:00 2001 From: JJ Date: Tue, 31 Oct 2023 00:47:54 -0700 Subject: compiler: implement most of the parser --- src/ast.rs | 5 +- src/lex.rs | 107 +++++++++++++++++++- src/parse.rs | 322 +++++++++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 367 insertions(+), 67 deletions(-) diff --git a/src/ast.rs b/src/ast.rs index e55b473..6c7963e 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -59,6 +59,7 @@ pub enum Binding { value: Option> // variable bindings can be delayed }, Const { + public: bool, id: Pattern, kind: Option, value: Box @@ -83,7 +84,7 @@ pub struct ParamDecl { id: Id, kind: Type } /// Expressions related to control flow. pub enum Control { Call { id: Id, params: Vec }, // function calls, macro invocations, field access... - Cond { + If { branches: Vec, else_body: Option> }, @@ -93,7 +94,7 @@ pub enum Control { finally: Option> }, Match { - item: Box, + item: Pattern, branches: Vec }, Block { id: Option, body: Vec }, diff --git a/src/lex.rs b/src/lex.rs index 396c06d..771ba38 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -30,7 +30,8 @@ impl std::error::Error for LexicalError {} /// **Basic** syntax tokens. Form an unambiguous TokenStream. #[derive(Clone, PartialEq)] pub enum Token { - Word(String), // identifiers. + Key(Keyword), // keyword identifiers. + Word(String), // non-keyword identifiers. Num(String), // numeric value, ex. 413, 0b101011, 0xabcd Lit(Literal), // literal value, ex. for strings/comments. Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words. @@ -47,6 +48,23 @@ pub enum Literal { MultiLineComment(String), } +/// Keywords, made explicit for easier use with Rust. +/// (strings inside match patterns are fucky!!) +#[derive(Clone, PartialEq)] +pub enum Keyword { + Pub, Let, Var, Const, + Func, Macro, Type, + Mod, From, Import, + For, While, Loop, + Block, Static, + If, When, Elif, Else, Match, + Try, Catch, Finally, + Struct, Tuple, Enum, Union, Interface, + Distinct, Ref, // todo: Mut once figured out + Break, Continue, Return, + In, Is, Of, As, +} + /// All punctuation recognized by the lexer. /// Note the distinction between FuncLeftParen and TupleLeftParen. #[derive(Clone, PartialEq)] @@ -243,7 +261,47 @@ pub fn tokenize(input: &str) -> Result { input.next(); }, _ => { - res.push(Word(String::from(&buf))); + use Keyword::*; + match buf.as_str() { // keywords! + "pub" => res.push(Key(Pub)), + "let" => res.push(Key(Let)), + "var" => res.push(Key(Var)), + "const" => res.push(Key(Const)), + "func" => res.push(Key(Func)), + "macro" => res.push(Key(Macro)), + "type" => res.push(Key(Type)), + "mod" => res.push(Key(Mod)), + "from" => res.push(Key(From)), + "import" => res.push(Key(Import)), + "for" => res.push(Key(For)), + "while" => res.push(Key(While)), + "loop" => res.push(Key(Loop)), + "block" => res.push(Key(Block)), + "static" => res.push(Key(Static)), + "if" => res.push(Key(If)), + "when" => res.push(Key(When)), + "elif" => res.push(Key(Elif)), + "else" => res.push(Key(Else)), + "match" => res.push(Key(Match)), + "try" => res.push(Key(Try)), + "catch" => res.push(Key(Catch)), + "finally" => res.push(Key(Finally)), + "struct" => res.push(Key(Struct)), + "tuple" => res.push(Key(Tuple)), + "enum" => res.push(Key(Enum)), + "union" => res.push(Key(Union)), + "interface" => res.push(Key(Interface)), + "distinct" => res.push(Key(Distinct)), + "ref" => res.push(Key(Ref)), + "break" => res.push(Key(Break)), + "continue" => res.push(Key(Continue)), + "return" => res.push(Key(Return)), + "in" => res.push(Key(In)), + "is" => res.push(Key(Is)), + "of" => res.push(Key(Of)), + "as" => res.push(Key(As)), + _ => res.push(Word(String::from(&buf))) + } match x { // () and [] denote both parameters/generics and tuples/arrays '(' => { // we must disambiguate by treating those *directly* after words as such res.push(Sep(FuncLeftParen)); @@ -372,6 +430,7 @@ impl std::fmt::Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; match self { + Key(word) => write!(f, "{}", word), Word(val) => write!(f, "{}", val), Num(val) => write!(f, "{}", val), Lit(lit) => write!(f, "{}", lit), @@ -395,6 +454,50 @@ impl std::fmt::Display for Literal { } } +impl std::fmt::Display for Keyword { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use Keyword::*; + match self { + Pub => write!(f, "pub"), + Let => write!(f, "let"), + Var => write!(f, "var"), + Const => write!(f, "const"), + Func => write!(f, "func"), + Macro => write!(f, "macro"), + Type => write!(f, "type"), + Mod => write!(f, "mod"), + From => write!(f, "from"), + Import => write!(f, "import"), + For => write!(f, "for"), + While => write!(f, "while"), + Loop => write!(f, "loop"), + Block => write!(f, "block"), + Static => write!(f, "static"), + If => write!(f, "if"), + When => write!(f, "when"), + Elif => write!(f, "elif"), + Else => write!(f, "else"), + Match => write!(f, "match"), + Try => write!(f, "try"), + Catch => write!(f, "catch"), + Finally => write!(f, "finally"), + Struct => write!(f, "struct"), + Tuple => write!(f, "tuple"), + Enum => write!(f, "enum"), + Union => write!(f, "union"), + Interface => write!(f, "interface"), + Distinct => write!(f, "distinct"), + Ref => write!(f, "ref"), + Break => write!(f, "break"), + Continue => write!(f, "continue"), + Return => write!(f, "return"), + In => write!(f, "in"), + Is => write!(f, "is"), + Of => write!(f, "of"), + As => write!(f, "as"), + } + } +} impl std::fmt::Display for Punctuation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Punctuation::*; diff --git a/src/parse.rs b/src/parse.rs index 9a60863..1dabd47 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,101 +1,297 @@ -use multipeek::*; +use std::fmt; + use crate::lex::*; -use crate::ast::Expr; +use crate::ast::*; use crate::ast::Binding::*; use crate::ast::Control::*; use crate::ast::Pattern::*; +use Token::*; +use Literal::*; +use Punctuation::*; + +type Input = std::iter::Peekable>; + +#[derive(Clone, Copy)] +struct State { + depth: usize, + step: usize +} + +impl State { + fn indent(&self) -> State { + State { depth: self.depth + self.step, step: self.step } + } + fn dedent(&self) -> State { + State { depth: self.depth - self.step, step: self.step } + } +} /// Convert a basic TokenStream into an AbstractSyntaxTree pub fn astify(input: TokenStream, name: &str) -> Result { - use Token::*; - use Literal::*; - use Punctuation::*; + let mut input = input.into_iter().peekable(); + let body = parse_body(&mut input, State { depth: 0, step: 0 })?; + Ok(Expr::Binding(Module{ id: name.to_string(), body })) +} - let mut input = multipeek(input); +/// Parse a series of Exprs, for ex. the body of a function. +fn parse_body(input: &mut Input, state: State) -> Result> { let mut res = Vec::new(); - while let Some(x) = input.peek() { - res.push(parse(&mut input, 0)?); + while input.peek() == Some(&Indent(state.depth)) { + input.next(); + res.push(parse_expr(input, state)?); } - Ok(Expr::Binding(Module{ id: name.to_string(), body: res })) + Ok(res) } -fn parse(input: &mut MultiPeek>, depth: usize) -> Result { - use Token::*; - use Literal::*; - use Punctuation::*; - let mut input = input; - match input.peek() { - Some(Word(val)) => match val.as_str() { - "pub" => { - input.next(); - if let Some(Word(val)) = input.peek() { - match val.as_str() { - "const" => parse_const(&mut input, true), - "func" => parse_func(&mut input, true), - "type" => parse_type(&mut input, true), - "mod" => parse_mod(&mut input, true), +/// Expr ::= Let | Var | Const | Func | Type | +/// Mod | Import | Block | Static | +/// For | While | Loop | If | When | Try | Match +fn parse_expr(input: &mut Input, state: State) -> Result { + use Keyword::*; + match input.next() { + Some(Key(word)) => match word { + Pub => { + match input.next() { + Some(Key(word)) => match word { + Const => parse_const(input, state, true), + Func => parse_funcdecl(input, state, true), + Type => parse_typedecl(input, state, true), + Mod => parse_mod(input, state, true), _ => return Err("unrecognized keyword following pub".into()), } - } else { - return Err("unrecognized thing following pub".into()); + Some(_) => return Err("unrecognized thing following pub".into()), + None => return Err("end of input".into()), } }, - "let" => parse_let(&mut input), - "var" => parse_var(&mut input), - "const" => parse_const(&mut input, false), - "func" => parse_func(&mut input, false), - "type" => parse_type(&mut input, false), - "mod" => parse_mod(&mut input, false), - "from" | "import" => parse_import(&mut input), - "block" => parse_block(&mut input), - "static" => parse_static(&mut input), - "for" => parse_for(&mut input), - "while" => parse_while(&mut input), - "loop" => parse_loop(&mut input), - "if" => parse_if(&mut input), - "when" => parse_when(&mut input), - "try" => parse_try(&mut input), - "match" => parse_match(&mut input), - _ => parse_line(&mut input), + Let => parse_let(input, state), + Var => parse_var(input, state), + Const => parse_const(input, state, false), + Func => parse_funcdecl(input, state, false), + Type => parse_typedecl(input, state, false), + Mod => parse_mod(input, state, false), + From => parse_import(input, state, true), + Import => parse_import(input, state, false), + Block => parse_block(input, state), + Static => parse_static(input, state), + For => parse_for(input, state), + While => parse_while(input, state), + Loop => parse_loop(input, state), + If => parse_if(input, state), + When => parse_when(input, state), + Try => parse_try(input, state), + Match => parse_match(input, state), + _ => return Err("invalid keyword starting expression".into()), }, - _ => parse_line(&mut input), + _ => todo!(), // what can i do with this?? match line here } } +/// Let ::= 'let' Pattern Annotation? '=' Expr +fn parse_let(input: &mut Input, state: State) -> Result { + let id = parse_pattern(input, state)?; + let mut kind = None; + if let Some(Sep(Colon)) = input.peek() { + input.next(); + kind = Some(parse_typedesc(input, state)?); + } + if input.next() != Some(Sep(Equals)) { + return Err("= not following binding".into()) + } + let value = Box::new(parse_expr(input, state)?); + Ok(Expr::Binding(Let { id, kind, value })) +} +/// Var ::= 'var' Pattern Annotation? ('=' Expr)? +fn parse_var(input: &mut Input, state: State) -> Result { + let id = parse_pattern(input, state)?; + let mut kind = None; + if let Some(Sep(Colon)) = input.peek() { + input.next(); + kind = Some(parse_typedesc(input, state)?); + } + let mut value = None; + if input.next() != Some(Sep(Equals)) { + value = Some(Box::new(parse_expr(input, state)?)); + } + Ok(Expr::Binding(Var { id, kind, value })) +} // Const ::= 'pub'? 'const' Pattern Annotation? '=' Expr -fn parse_const(input: &mut MultiPeek>, public: bool) -> Result { todo!() } +fn parse_const(input: &mut Input, state: State, public: bool) -> Result { + let id = parse_pattern(input, state)?; + let mut kind = None; + if let Some(Sep(Colon)) = input.peek() { + input.next(); + kind = Some(parse_typedesc(input, state)?); + } + if input.next() != Some(Sep(Equals)) { + return Err("= not following binding".into()) + } + let value = Box::new(parse_expr(input, state)?); + Ok(Expr::Binding(Const { public, id, kind, value })) +} // Func ::= 'pub'? ('func' | 'proc') Ident Generics? Parameters? (':' TypeDesc) '=' Body -fn parse_func(input: &mut MultiPeek>, public: bool) -> Result { todo!() } +fn parse_funcdecl(input: &mut Input, state: State, public: bool) -> Result { todo!() } // TypeDecl ::= 'pub'? 'type' Pattern Generics? '=' 'distinct'? 'ref'? TypeDesc -fn parse_type(input: &mut MultiPeek>, public: bool) -> Result { todo!() } +fn parse_typedecl(input: &mut Input, state: State, public: bool) -> Result { + let pattern = parse_pattern(input, state)?; + todo!() +} // Mod ::= 'pub'? 'mod' Ident ':' Body -fn parse_mod(input: &mut MultiPeek>, public: bool) -> Result { todo!() } +fn parse_mod(input: &mut Input, state: State, public: bool) -> Result { + match input.next() { + Some(Word(id)) => { + match input.next() { + Some(Sep(Colon)) => { + let body = parse_body(input, state.indent())?; + Ok(Expr::Binding(Module { id, body })) + }, + _ => return Err("unexpected token following mod label".into()), + } + }, + _ => return Err("unexpected thing following mod keyword".into()), + } +} -// Let ::= 'let' Pattern Annotation? '=' Expr -fn parse_let(input: &mut MultiPeek>) -> Result { todo!() } -// Var ::= 'var' Pattern Annotation? ('=' Expr)? -fn parse_var(input: &mut MultiPeek>) -> Result { todo!() } // Import ::= ('from' Ident)? 'import' Ident (',' Ident)* ('as' Ident)? -fn parse_import(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_import(input: &mut Input, state: State, from_scope: bool) -> Result { + let mut from = None; + if from_scope { + match input.next() { + Some(Word(id)) => from = Some(id), + _ => return Err("identifier not following from keyword".into()) + } + if input.next() != Some(Key(Keyword::Import)) { + return Err("expected import to follow from".into()) + } + } + todo!() +} // Block ::= 'block' Ident? ':' Body -fn parse_block(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_block(input: &mut Input, state: State) -> Result { // todo: body + offset + match input.next() { + Some(Sep(Colon)) => { + let id = None; + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(Block { id, body })) + }, + Some(Word(label)) => { + match input.next() { + Some(Sep(Colon)) => { + let id = Some(label); + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(Block { id, body })) + }, + _ => return Err("unexpected token following block label".into()), + } + }, + _ => return Err("unexpected thing following block keyword".into()), + } +} // Static ::= 'static' ':' Body -fn parse_static(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_static(input: &mut Input, state: State) -> Result { + if input.next() != Some(Sep(Colon)) { + return Err("colon must follow static invocation".into()); + } + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(Static { body })) +} + // For ::= 'for' Pattern 'in' Expr ':' Body -fn parse_for(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_for(input: &mut Input, state: State) -> Result { + let binding = parse_pattern(input, state)?; + if input.next() != Some(Key(Keyword::In)) { + return Err("expected in keyword after for pattern".into()); + } + let range = Box::new(parse_expr(input, state)?); + if input.next() != Some(Sep(Colon)) { + return Err("expected colon after in expression".into()); + } + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(For { binding, range, body })) +} // While ::= 'while' Expr ':' Body -fn parse_while(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_while(input: &mut Input, state: State) -> Result { + let cond = Box::new(parse_expr(input, state)?); + if input.next() != Some(Sep(Colon)) { + return Err("expected colon after while keyword".into()); + } + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(While { cond, body })) +} // Loop ::= 'loop' ':' Body -fn parse_loop(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_loop(input: &mut Input, state: State) -> Result { + if input.next() != Some(Sep(Colon)) { + return Err("expected colon after loop keyword".into()); + } + let body = parse_body(input, state.indent())?; + Ok(Expr::Control(Loop { body })) +} + // If ::= 'if' Expr ':' Body ('elif' Expr ':' Body)* ('else' ':' Body)? -fn parse_if(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_if(input: &mut Input, state: State) -> Result { + let mut branches = Vec::new(); + branches.push(parse_cond_branch(input, state)?); + while input.peek() == Some(&Key(Keyword::Elif)) { + input.next(); + branches.push(parse_cond_branch(input, state)?); + } + let mut else_body = None; + if input.peek() == Some(&Key(Keyword::Else)) { + input.next(); + else_body = Some(parse_body(input, state.indent())?); + } + Ok(Expr::Control(If { branches, else_body })) +} // When ::= 'when' Expr ':' Body ('elif' Expr ':' Body)* ('else' ':' Body)? -fn parse_when(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_when(input: &mut Input, state: State) -> Result { + let mut branches = Vec::new(); + branches.push(parse_cond_branch(input, state)?); + while input.peek() == Some(&Key(Keyword::Elif)) { + input.next(); + branches.push(parse_cond_branch(input, state)?); + } + let mut else_body = None; + if input.peek() == Some(&Key(Keyword::Else)) { + input.next(); + else_body = Some(parse_body(input, state.indent())?); + } + let mut body = Vec::new(); + body.push(Expr::Control(If { branches, else_body })); + Ok(Expr::Control(Static { body })) +} // Try ::= 'try' ':' Body ('except' Ident (',' Ident)* ':' Body) ('finally' ':' Body)? -fn parse_try(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_try(input: &mut Input, state: State) -> Result { + if input.next() != Some(Sep(Colon)) { + return Err("expected colon after try keyword".into()); + } + let body = parse_body(input, state.indent())?; + let catches = Vec::new(); + while input.peek() == Some(&Key(Keyword::Catch)) { + input.next(); + todo!(); + } + let mut finally = None; + if input.peek() == Some(&Key(Keyword::Finally)) { + input.next(); + if input.next() != Some(Sep(Colon)) { + return Err("expected colon after try keyword".into()); + } + finally = Some(parse_body(input, state.indent())?); + } + Ok(Expr::Control(Try { body, catches, finally })) +} // Match ::= 'match' Expr ('of' Pattern (',' Pattern)* ('where' Expr)? ':' Body)+ -fn parse_match(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_match(input: &mut Input, state: State) -> Result { + let item = parse_pattern(input, state)?; + let mut branches = Vec::new(); + while input.peek() == Some(&Key(Keyword::Of)) { + input.next(); + todo!(); + } + Ok(Expr::Control(Match { item, branches })) +} -fn parse_line(input: &mut MultiPeek>) -> Result { todo!() } +fn parse_typedesc(input: &mut Input, state: State) -> Result { todo!() } +fn parse_pattern(input: &mut Input, state: State) -> Result { todo!() } +fn parse_cond_branch(input: &mut Input, state: State) -> Result { todo!() } // lex, parse, expand, compile? -- cgit v1.2.3-70-g09d2