diff options
Diffstat (limited to 'src/frontend/lex.rs')
-rw-r--r-- | src/frontend/lex.rs | 682 |
1 files changed, 372 insertions, 310 deletions
diff --git a/src/frontend/lex.rs b/src/frontend/lex.rs index e0aa76d..287ed9c 100644 --- a/src/frontend/lex.rs +++ b/src/frontend/lex.rs @@ -2,25 +2,41 @@ use multipeek::multipeek; use crate::*; -pub struct TokenStream(Vec<Token>); +/// We lex input into a TokenStream containing the position of the token and the Token itself. +pub struct TokenStream(Vec<(usize, Token)>); +/// Iterating over a TokenStream pops both the Tokens and the spans. +/// The parser only *really* cares about Tokens... spans are for use in error reporting. impl IntoIterator for TokenStream { - type Item = Token; - type IntoIter = std::vec::IntoIter<Token>; + type Item = (usize, Token); + type IntoIter = std::vec::IntoIter<(usize, Token)>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } +/// Basic syntax tokens. Form an unambiguous TokenStream. +#[derive(Clone, PartialEq)] +pub enum Token { + Key(Keyword), // keyword identifiers. ex. for, use, func... + Lit(Literal), // literal values. ex. for strings/comments. + Sep(Punctuation), // punctuation. non-word tokens. + Op(String), // operators. lexographically separate so the parser can parse infix notation + Num(String), // numeric values. ex. 413, 0b101011, 0xabcd, 612.1024 + Ident(String), // identifiers. variable names, function names, method calls... + Indent(usize), // indentation. denotes line breaks and scope at which a line starts + End // explicit scope end marker. inserted in normalization. +} + #[derive(Clone, PartialEq, Debug)] pub enum LexicalError { - InvalidIndentation, - MismatchedParens, - MismatchedBrackets, - UnknownPunctuation, + InvalidIndentation(usize), + MismatchedParens(usize), + MismatchedBrackets(usize), + UnknownPunctuation(usize), + EarlyEndOfInput, } - impl std::fmt::Display for LexicalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -28,18 +44,7 @@ impl std::fmt::Display for LexicalError { } impl std::error::Error for LexicalError {} -/// **Basic** syntax tokens. Form an unambiguous TokenStream. -#[derive(Clone, PartialEq)] -pub enum Token { - Key(Keyword), // keyword identifiers. - Word(String), // non-keyword identifiers. - Op(String), // operators. separate tokens lexographically so the parser knows to parse infix notation. - Num(String), // numeric value, ex. 413, 0b101011, 0xabcd - Lit(Literal), // literal value, ex. for strings/comments. - Sep(Punctuation), // punctuation. non-word tokens. ~~operators are lexed as this and later transformed to words.~~ - Indent(usize), // indentation. denotes line breaks and scope at which a line starts. -} - +/// Literal text. This can be chars, strings, and comments. #[derive(Clone, PartialEq)] pub enum Literal { Char(String), @@ -51,7 +56,7 @@ pub enum Literal { } /// Keywords, made explicit for easier use with Rust. -/// (strings inside match patterns are fucky!!) +/// (static strings inside match patterns are fucky!!) #[derive(Clone, PartialEq)] pub enum Keyword { Pub, Let, Var, Const, @@ -86,8 +91,8 @@ pub enum Punctuation { GenericRightBracket, // ] ArrayLeftBracket, // [ ArrayRightBracket, // ] - ScopeLeftBrace, // { - ScopeRightBrace, // } + DependentLeftBrace, // { (not currently used) + DependentRightBrace, // } StructLeftBrace, // { StructRightBrace, // } Equals, // = @@ -110,320 +115,371 @@ pub enum Punctuation { Backslash, // \ } -/// Parses whitespace-sensitive code into an unambiguous TokenStream. -/// Also useful for formatting. -pub fn tokenize(input: &str) -> Result<TokenStream> { - // The design of this lexer utilizes to great extent multipeek's arbitrary peeking. - // Tokens are matched by looping within their case until complete. - // This then eliminates the need for most global parser state. (i hate state) +struct Input<'a>(multipeek::MultiPeek<std::str::CharIndices<'a>>); - use Token::*; - use Literal::*; - use Punctuation::*; - use LexicalError::*; - enum Paren { Func, Tuple } - enum Bracket { Generic, Array } - struct State { - start_of_line: bool, - paren_stack: Vec<Paren>, - bracket_stack: Vec<Bracket>, +impl<'a> Input<'a> { + /// Map input.next() to return Results for use with the propagation operator + fn next(&mut self) -> Result<char> { + self.0.next().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn peek_eat(&mut self, expected: char) -> bool { + if self.peek().is_ok_and(|x| x == expected) { + self.0.next(); + true + } else { + false + } } + fn peek(&mut self) -> Result<char> { + self.0.peek().copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn peek_nth(&mut self, n: usize) -> Result<char> { + self.0.peek_nth(n).copied().map(|x| x.1).ok_or(EarlyEndOfInput.into()) + } + + fn consume_indentation(&mut self) -> usize { + let mut count = 0; + while self.peek_eat(' ') { + count += 1; + } + if self.peek_eat('\n') { + self.consume_indentation() + } else { + count + } + } + + fn strip(&mut self) { + loop { + if !self.peek_eat(' ') { + break + } + } + } +} + +use Token::*; +use Literal::*; +use Punctuation::*; +use LexicalError::*; +enum Paren { Func, Tuple } +enum Bracket { Generic, Array } +struct State { + start_of_line: bool, + paren_stack: Vec<Paren>, + bracket_stack: Vec<Bracket>, + next_token_is_parameter_punctuation: bool, +} + +/// Parses whitespace-sensitive code into an unambiguous TokenStream. +pub fn lex(input: &str) -> Result<TokenStream> { + let mut input = Input(multipeek(input.char_indices())); + let mut res = TokenStream(Vec::new()); let mut state = State { start_of_line: true, paren_stack: vec!(), bracket_stack: vec!(), + next_token_is_parameter_punctuation: false, }; + res.0.push((0, Indent(input.consume_indentation()))); + loop { + input.strip(); + match input.0.next() { + Some((pos, c)) => res.0.push((pos, lex_token(&mut input, &mut state, pos, c)?)), + None => break + } + } + Ok(res) +} - let mut buf = String::new(); - let mut res = Vec::new(); - - // `char` in rust is four bytes it's fine - let mut input = multipeek(input.chars()); - while let Some(c) = input.next() { - match c { - ' ' => { // indentation! and whitespace - match res.last() { - Some(Indent(_)) => { // indentation! - res.pop(); // discard previous empty or useless Indent token - let mut current_indent_level = 1; - while let Some(x) = input.peek() { - match x { - ' ' => { current_indent_level += 1; input.next(); }, - _ => match res.last() { // indentation ends - Some(Word(a)) if a == "==" || a == "and" || a == "or" || - a == "xor" || a == "in" || a == "is" => break, - Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) | - Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) | - Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break, - _ => { - res.push(Indent(current_indent_level)); - break; - } - } - } - } - }, - _ => { // get rid of excess (all) whitespace between words/operators - while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); } +fn lex_token(input: &mut Input, state: &mut State, pos: usize, c: char) -> Result<Token> { + Ok(match c { + '\n' => Indent(input.consume_indentation()), + '\t' => return Err(InvalidIndentation(pos).into()), + ' ' => unreachable!(), + ':' => Sep(Colon), + '=' => Sep(Equals), + '\'' => lex_char(input)?, + '"' => lex_string(input)?, + '#' => lex_comment(input)?, + c if c.is_alphabetic() || c == '_' => lex_identifier(input, state, c), // valid identifiers! + c if c.is_digit(10) => lex_number(input, c), // numeric literals! + '-' => { + // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. + match input.peek()? { + ' ' => Sep(Minus), + _ => Sep(Negative) + } + }, + '(' => { + if state.next_token_is_parameter_punctuation { + state.next_token_is_parameter_punctuation = false; + state.paren_stack.push(Paren::Func); + Sep(FuncLeftParen) + } else { + state.paren_stack.push(Paren::Tuple); + Sep(TupleLeftParen) + } + }, + '[' => { + if state.next_token_is_parameter_punctuation { + state.next_token_is_parameter_punctuation = false; + state.bracket_stack.push(Bracket::Generic); + Sep(GenericLeftBracket) + } else { + state.bracket_stack.push(Bracket::Array); + Sep(ArrayLeftBracket) + } + }, + ')' => { + match state.paren_stack.pop() { + Some(Paren::Func) => Sep(FuncRightParen), + Some(Paren::Tuple) => Sep(TupleRightParen), + None => return Err(MismatchedParens(pos).into()), + } + }, + ']' => { + match state.bracket_stack.pop() { + Some(Bracket::Generic) => { + if let Ok('(') = input.peek() { + state.next_token_is_parameter_punctuation = true; } + Sep(GenericRightBracket) + }, + Some(Bracket::Array) => Sep(ArrayRightBracket), + None => return Err(MismatchedBrackets(pos).into()), + } + }, + '`' => { // backticks are used for injections, so generics/parameters may follow + // todo: SERIOUS ambiguity issue with `injection`[] + match input.peek() { + Ok('(') | Ok('[') => { + state.next_token_is_parameter_punctuation = true; + }, + _ => () + } + Sep(BackTick) + }, + ',' => Sep(Comma), + '.' => Sep(Period), + ';' => Sep(Semicolon), + '{' => Sep(StructLeftBrace), + '}' => Sep(StructRightBrace), + '+' => Sep(Plus), + '*' => Sep(Times), + '/' => Sep(Slash), + '<' => Sep(LessThan), + '>' => Sep(GreaterThan), + '@' => Sep(At), + '$' => Sep(Sha), + '~' => Sep(Tilde), + '&' => Sep(And), + '|' => Sep(Or), + '!' => Sep(Exclamation), + '?' => Sep(Question), + '^' => Sep(Caret), + '\\' => Sep(Backslash), + _ => return Err(UnknownPunctuation(pos).into()) + }) +} + +/// Char ::= '\'' <anything> '\'' +fn lex_char(input: &mut Input) -> Result<Token> { + let mut buf = String::new(); + loop { + match input.next()? { + '\'' => break, + '\\' => { + if let y = input.next()? { + buf.push('\\'); + buf.push(y); } }, - '\t' => return Err(InvalidIndentation.into()), - '\n' => res.push(Indent(0)), - '\'' => { // chars! - while let Some(x) = input.next() { - match x { - '\'' => break, - '\\' => if let Some(y) = input.next() { buf.push(y) }, - _ => buf.push(x) - } + x => buf.push(x) + } + } + Ok(Lit(Char(buf))) +} + +/// String ::= '"' <anything> '"' | '"""' <anything> '"""' +fn lex_string(input: &mut Input) -> Result<Token> { + let mut buf = String::new(); + match (input.peek_nth(0), input.peek_nth(1)) { + (Ok('"'), Ok('"')) => { // triple quoted strings + input.next()?; input.next()?; + loop { + match input.next()? { + '"' if input.peek_nth(0)? == '"' && + input.peek_nth(1)? == '"' => { + input.next()?; input.next()?; + break; + }, + x => buf.push(x) } - res.push(Lit(Char(String::from(&buf)))); - }, - '"' => { // strings! - match (input.peek_nth(0).copied(), input.peek_nth(1).copied()) { - (Some('"'), Some('"')) => { // triple quoted strings - input.next(); input.next(); - while let Some(x) = input.next() { - match x { - '"' if input.peek_nth(0) == Some(&'"') && - input.peek_nth(1) == Some(&'"') => { - input.next(); input.next(); - break; - }, - _ => buf.push(x) - } - } - res.push(Lit(MultiLineString(String::from(&buf)))); + } + Ok(Lit(MultiLineString(buf))) + }, + _ => { // single quoted strings + loop { + match input.next()? { + '"' => break, + '\\' => if let y = input.next()? { + buf.push(y) // FIXME }, - (_, _) => { // single quoted strings - while let Some(x) = input.next() { - match x { - '"' => break, - '\\' => if let Some(y) = input.next() { buf.push(y) }, - _ => buf.push(x) - } - } - res.push(Lit(SingleLineString(String::from(&buf)))); - } + x => buf.push(x) } - }, - '#' => { // comments! - match input.peek() { - Some('[') => { // block comment, can be nested - input.next(); - let mut comment_level = 1; - while let Some(x) = input.next() && comment_level > 0 { - match x { - '#' if input.peek() == Some(&'[') => { - comment_level += 1; - input.next(); - }, - ']' if input.peek() == Some(&'#') => { - comment_level -= 1; - input.next(); - }, - _ => buf.push(x) - } - } - res.push(Lit(MultiLineComment(String::from(&buf)))); + } + Ok(Lit(SingleLineString(buf))) + } + } +} + +/// Comment ::= '#' <anything> '\n' | '#[' Comment | <anything> ']#' +fn lex_comment(input: &mut Input) -> Result<Token> { + let mut buf = String::new(); + match input.peek() { + Ok('[') => { // block comment, can be nested + input.next()?; + let mut comment_level = 1; + while comment_level > 0 { + match input.next()? { + '#' if input.peek_eat('[') => { + comment_level += 1; + buf.push_str("#["); }, - Some(&'#') => { // documentation comment - input.next(); - while let Some(x) = input.peek() { - match x { - '\n' => break, - _ => { - buf.push(*x); - } - } - input.next(); - } - res.push(Lit(DocComment(String::from(&buf)))); + ']' if input.peek_eat('#') => { + comment_level -= 1; + buf.push_str("]#"); }, - _ => { // standard comment, runs til EOL - while let Some(x) = input.peek() { - match x { - '\n' => break, - _ => { - buf.push(*x); - } - } - input.next(); - } - res.push(Lit(Comment(String::from(&buf)))); - } - } - }, - c if c.is_alphabetic() || c == '_' => { // valid identifiers! - buf.push(c); - while let Some(x) = input.peek() { - match x { - x if x.is_alphanumeric() || x == &'_' => { - buf.push(*x); - input.next(); - }, - _ => { - use Keyword::*; - match buf.as_str() { // keywords! - "pub" => res.push(Key(Pub)), - "let" => res.push(Key(Let)), - "var" => res.push(Key(Var)), - "const" => res.push(Key(Const)), - "func" => res.push(Key(Func)), - "macro" => res.push(Key(Macro)), - "type" => res.push(Key(Type)), - "mod" => res.push(Key(Mod)), - "use" => res.push(Key(Use)), - "as" => res.push(Key(As)), - "for" => res.push(Key(For)), - "while" => res.push(Key(While)), - "loop" => res.push(Key(Loop)), - "block" => res.push(Key(Block)), - "static" => res.push(Key(Static)), - "if" => res.push(Key(If)), - "when" => res.push(Key(When)), - "elif" => res.push(Key(Elif)), - "else" => res.push(Key(Else)), - "match" => res.push(Key(Match)), - "where" => res.push(Key(Where)), - "try" => res.push(Key(Try)), - "catch" => res.push(Key(Catch)), - "finally" => res.push(Key(Finally)), - "struct" => res.push(Key(Struct)), - "tuple" => res.push(Key(Tuple)), - "enum" => res.push(Key(Enum)), - "union" => res.push(Key(Union)), - "interface" => res.push(Key(Interface)), - "distinct" => res.push(Key(Distinct)), - "ref" => res.push(Key(Ref)), - "ptr" => res.push(Key(Ptr)), - "mut" => res.push(Key(Mut)), - "break" => res.push(Key(Break)), - "continue" => res.push(Key(Continue)), - "return" => res.push(Key(Return)), - "in" => res.push(Key(In)), - "is" => res.push(Key(Is)), - "of" => res.push(Key(Of)), - _ => res.push(Word(String::from(&buf))) - } - match x { // () and [] denote both parameters/generics and tuples/arrays - '(' => { // we must disambiguate by treating those *directly* after words as such - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - }, - '[' => { - res.push(Sep(GenericLeftBracket)); - state.bracket_stack.push(Bracket::Generic); - input.next(); - }, - _ => {}, - } - break; - } + x => buf.push(x) + }; + } + Ok(Lit(MultiLineComment(buf))) + }, + Ok('#') => { // documentation comment + input.next()?; + while let Ok(x) = input.peek() { + match x { + '\n' => break, + _ => { + buf.push(x); + input.next()?; } } - }, - '0'..='9' => { // numeric literals! - buf.push(c); - while let Some(x) = input.peek() { - match x { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { - buf.push(*x); - input.next(); - }, - _ => break + } + Ok(Lit(DocComment(buf))) + }, + _ => { // standard comment, runs til EOL + while let Ok(x) = input.peek() { + match x { + '\n' => break, + _ => { + buf.push(x); + input.next()?; } } - res.push(Num(String::from(&buf))) - }, - '-' => { // `-` is special. it can be the *prefix* operator "Negative", or part of a regular operator. - match input.peek() { - Some(' ') => res.push(Sep(Minus)), - _ => res.push(Sep(Negative)) - } - }, - '(' => { // note: FuncParens were matched above, directly after identifiers - res.push(Sep(TupleLeftParen)); - state.paren_stack.push(Paren::Tuple); - }, - '[' => { // note: GenericBrackets were matched above, directly after identifiers - res.push(Sep(ArrayLeftBracket)); - state.bracket_stack.push(Bracket::Array); - }, - ')' => { // match parens - match state.paren_stack.pop() { - Some(Paren::Func) => res.push(Sep(FuncRightParen)), - Some(Paren::Tuple) => res.push(Sep(TupleRightParen)), - None => return Err(MismatchedParens.into()), - } - }, - ']' => { // match brackets - match state.bracket_stack.pop() { - Some(Bracket::Generic) => res.push(Sep(GenericRightBracket)), - Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)), - None => return Err(MismatchedBrackets.into()), - } - if input.peek() == Some(&'(') { // parameters following generics - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - } + } + Ok(Lit(Comment(buf))) + } + } +} + +#[allow(unused_assignments)] +fn lex_identifier(input: &mut Input, state: &mut State, c: char) -> Token { + use Keyword::*; + let mut buf = String::new(); + let mut keyword = None; + buf.push(c); + loop { + match input.peek() { + Ok(x) if x.is_alphanumeric() || x == '_' => { + buf.push(x); + let _ = input.next(); }, - '`' => { // backticks are used for operators, so generics/parameters may follow - res.push(Sep(BackTick)); // todo: backticks could like not be used for operators - match input.peek() { - Some('(') => { - res.push(Sep(FuncLeftParen)); - state.paren_stack.push(Paren::Func); - input.next(); - }, - Some('[') => { - res.push(Sep(GenericLeftBracket)); - state.bracket_stack.push(Bracket::Generic); - input.next(); - }, - _ => {} - } + _ => { + keyword = Some(match buf.as_str() { // keywords! + "pub" => Key(Pub), + "let" => Key(Let), + "var" => Key(Var), + "const" => Key(Const), + "func" => Key(Func), + "macro" => Key(Macro), + "type" => Key(Type), + "mod" => Key(Mod), + "use" => Key(Use), + "as" => Key(As), + "for" => Key(For), + "while" => Key(While), + "loop" => Key(Loop), + "block" => Key(Block), + "static" => Key(Static), + "if" => Key(If), + "when" => Key(When), + "elif" => Key(Elif), + "else" => Key(Else), + "match" => Key(Match), + "where" => Key(Where), + "try" => Key(Try), + "catch" => Key(Catch), + "finally" => Key(Finally), + "struct" => Key(Struct), + "tuple" => Key(Tuple), + "enum" => Key(Enum), + "union" => Key(Union), + "interface" => Key(Interface), + "distinct" => Key(Distinct), + "ref" => Key(Ref), + "ptr" => Key(Ptr), + "mut" => Key(Mut), + "in" => Key(In), + "is" => Key(Is), + "of" => Key(Of), + "break" => Key(Break), + "continue" => Key(Continue), + "return" => Key(Return), + _ => Ident(String::from(&buf)) + }); + break + } + } + } + // () and [] denote both parameters/generics and tuples/arrays + // we must disambiguate by treating those *directly* after words as such + match input.peek() { + Ok('(') | Ok('[') => state.next_token_is_parameter_punctuation = true, + _ => (), + } + keyword.unwrap() +} + +fn lex_number(input: &mut Input, c: char) -> Token { + let mut buf = String::new(); + buf.push(c); + loop { + match input.peek() { + Ok(x) if x.is_alphanumeric() => { // FIXME + buf.push(x); + let _ = input.next(); }, - ',' => res.push(Sep(Comma)), - '.' => res.push(Sep(Period)), - ';' => res.push(Sep(Semicolon)), - ':' => res.push(Sep(Colon)), - '{' => res.push(Sep(StructLeftBrace)), - '}' => res.push(Sep(StructRightBrace)), - '=' => res.push(Sep(Equals)), - '+' => res.push(Sep(Plus)), - '*' => res.push(Sep(Times)), - '/' => res.push(Sep(Slash)), - '<' => res.push(Sep(LessThan)), - '>' => res.push(Sep(GreaterThan)), - '@' => res.push(Sep(At)), - '$' => res.push(Sep(Sha)), - '~' => res.push(Sep(Tilde)), - '&' => res.push(Sep(And)), - '|' => res.push(Sep(Or)), - '!' => res.push(Sep(Exclamation)), - '?' => res.push(Sep(Question)), - '^' => res.push(Sep(Caret)), - '\\' => res.push(Sep(Backslash)), - _ => return Err(UnknownPunctuation.into()) + _ => break } - buf.clear(); } - Ok(TokenStream(res)) + Num(buf) } impl std::fmt::Display for TokenStream { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use Token::*; let mut prev_token = Indent(0); - for token in &self.0 { - match (&prev_token, &token) { - (Word(_), Word(_)) | (Word(_), Num(_)) | - (Num(_), Word(_)) | (Num(_), Num(_)) => write!(f, " {}", token)?, + for (_, token) in &self.0 { + match (prev_token, token) { // fixme + (Ident(_), Ident(_)) | (Ident(_), Num(_)) | + (Num(_), Ident(_)) | (Num(_), Num(_)) | + (Key(_), Key(_)) | (Key(_), Ident(_)) | (Ident(_), Key(_)) + => write!(f, " {}", token)?, _ => write!(f, "{}", token)?, } prev_token = token.clone(); @@ -437,11 +493,11 @@ impl std::fmt::Display for Token { use Token::*; match self { Key(word) => write!(f, "{}", word), - Word(val) => write!(f, "{}", val), - Num(val) => write!(f, "{}", val), + Ident(val) | Op(val) | Num(val) => write!(f, "{}", val), Lit(lit) => write!(f, "{}", lit), Sep(sep) => write!(f, "{}", sep), Indent(i) => write!(f, "\n{}", " ".repeat(*i)), + End => write!(f, "end"), } } } @@ -526,8 +582,8 @@ impl std::fmt::Display for Punctuation { GenericRightBracket => write!(f, "]"), ArrayLeftBracket => write!(f, " ["), ArrayRightBracket => write!(f, "]"), - ScopeLeftBrace => write!(f, "{{"), - ScopeRightBrace => write!(f, "}}"), + DependentLeftBrace => write!(f, "{{"), + DependentRightBrace => write!(f, "}}"), StructLeftBrace => write!(f, "{{"), StructRightBrace => write!(f, "}}"), Equals => write!(f, "="), @@ -551,3 +607,9 @@ impl std::fmt::Display for Punctuation { } } } + +pub fn test() { + let input = std::fs::read_to_string("std/ast.pk").unwrap(); + let lexed = lex(&input).unwrap(); + println!("{}", lexed); +} |