aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Cargo.lock7
-rw-r--r--Cargo.toml1
-rw-r--r--src/parser.rs214
3 files changed, 222 insertions, 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 964cc1d..c6a33ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6,10 +6,17 @@ version = 3
name = "chrysanthemum"
version = "0.1.0"
dependencies = [
+ "multipeek",
"peg",
]
[[package]]
+name = "multipeek"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d6b1cf1c2ae7c8c3898cbf8354ee836bc7037e35592d3739a9901d53c97b6a2"
+
+[[package]]
name = "peg"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 079d964..7f83af2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,6 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
+multipeek = "0.1.2"
peg = "0.8.1"
[features]
diff --git a/src/parser.rs b/src/parser.rs
index 95807c1..d3895a4 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -1,4 +1,5 @@
use crate::ast::*;
+use multipeek::multipeek;
/// Parses a lambda-calculus-like language into an AST.
pub fn parse_lambda(input: &str) -> Result<Expression, peg::error::ParseError<peg::str::LineCol>> {
@@ -100,3 +101,216 @@ pub fn parse_lambda(input: &str) -> Result<Expression, peg::error::ParseError<pe
return lambda::expr(input.trim());
}
+const operators: [char; 17] =
+ ['=', '+', '-', '*', '/', '<', '>', '@', '$', '~', '&', '%', '|', '!', '?', '^', '\\'];
+const brackets: [char; 6] = ['(', ')', '{', '}', '[', ']'];
+const special: [char; 7] = ['.', ',', ':', ';', '`', '\'', '"'];
+const keywords: [&'static str; 3] = ["if", "else", "func"];
+
+pub enum Token {
+ Operator(String),
+ Keyword(String),
+ Separator(String),
+ Identifier(String),
+ Value(String),
+ Char(char),
+ String(String),
+ Comment(String),
+ Token(String), // catch-all
+ ScopeBegin, // {
+ ScopeEnd, // }
+ ExprEnd, // ;
+}
+
+/// Properly lexes a whitespace-oriented language into a series of tokens.
+pub fn lex(input: &str) -> Result<Vec<Token>, &'static str> {
+ enum State {
+ Default,
+ Char,
+ String,
+ MultiLineString,
+ Comment,
+ }
+ struct Indentation {
+ blank: bool, // is the line entirely whitespace so far?
+ level: usize, // current indentation level
+ count: usize, // current whitespace count
+ }
+
+ let mut state = State::Default;
+ let mut indent = Indentation { blank: true, level: 0, count: 0 };
+ let mut buffer = String::new();
+ let mut result = Vec::new();
+
+ let mut input = multipeek(input.chars()); // multipeek my beloved
+ while let Some(c) = input.next() {
+ match state {
+ State::Default => match c {
+ ' ' if indent.blank => indent.count += 1,
+ ' ' if buffer.len() > 0 => {
+ result.push(parse_token(&buffer)?);
+ buffer.clear();
+ },
+ ' ' => todo!(),
+ '\n' => todo!(),
+ '\t' => return Err("Tabs are not supported!"),
+ '\'' => {
+ result.push(parse_token(&buffer)?);
+ buffer.clear();
+ if input.peek_nth(0) == Some(&'\\') || input.peek_nth(1) == Some(&'\'') {
+ state = State::Char;
+ } else {
+ result.push(Token::Separator("'".to_string()));
+ }
+ },
+ '"' => {
+ if input.peek_nth(0) == Some(&'\"') && input.peek_nth(1) == Some(&'\"') {
+ state = State::MultiLineString;
+ input.next();
+ input.next();
+ } else {
+ state = State::String;
+ }
+ },
+ '#' => {
+ state = State::Comment;
+ result.push(parse_token(&buffer)?);
+ buffer.clear();
+ },
+ _ if brackets.contains(&c) || special.contains(&c) => {
+ if buffer.len() > 0 {
+ result.push(parse_token(&buffer)?);
+ buffer.clear();
+ }
+ result.push(Token::Separator(c.to_string()));
+ if indent.blank {
+ indent.blank = false;
+ }
+ }
+ _ if indent.blank => {
+ indent.blank = false;
+ // indentation check
+ todo!();
+ buffer.push(c);
+ }
+ _ => buffer.push(c)
+ },
+ State::Char => match c {
+ '\\' => {
+ match input.next() {
+ Some('\\') => result.push(Token::Char('\\')),
+ Some('0') => result.push(Token::Char('\0')),
+ Some('n') => result.push(Token::Char('\n')),
+ Some('r') => result.push(Token::Char('\r')),
+ Some('t') => result.push(Token::Char('\t')),
+ Some('\"') => result.push(Token::Char('\"')),
+ Some('\'') => result.push(Token::Char('\'')),
+ _ => return Err("Invalid string escape sequence!"),
+ }
+ state = State::Default;
+ if input.next() != Some('\'') {
+ return Err("Invalid character sequence!")
+ }
+ },
+ '\'' => {
+ result.push(Token::Char('\0'));
+ state = State::Default;
+ }
+ _ => {
+ result.push(Token::Char(c));
+ state = State::Default;
+ if input.next() != Some('\'') {
+ return Err("Invalid character sequence!")
+ }
+ }
+ },
+ State::String => match c {
+ '\\' => match input.next() {
+ Some('\\') => buffer.push('\\'),
+ Some('0') => buffer.push('\0'),
+ Some('n') => buffer.push('\n'),
+ Some('r') => buffer.push('\r'),
+ Some('t') => buffer.push('\t'),
+ Some('\"') => buffer.push('\"'),
+ Some('\'') => buffer.push('\''),
+ _ => return Err("Invalid string escape sequence!"),
+ },
+ '\"' => {
+ state = State::Default;
+ result.push(Token::String(buffer.to_string()));
+ buffer.clear();
+ }
+ _ => buffer.push(c)
+ },
+ State::MultiLineString => match c {
+ '\"' if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') => {
+ state = State::Default;
+ result.push(Token::String(buffer.to_string()));
+ buffer.clear();
+ input.next();
+ input.next();
+ },
+ _ => buffer.push(c)
+ },
+ State::Comment => match c {
+ '\n' => {
+ state = State::Default;
+ result.push(Token::Comment(buffer.to_string()));
+ },
+ _ => buffer.push(c)
+ },
+ }
+ }
+ return Ok(result);
+}
+
+fn parse_token(token: &str) -> Result<Token, &'static str> {
+ if keywords.contains(&token) {
+ Ok(Token::Keyword(token.to_string()))
+ } else if is_operator(token) {
+ Ok(Token::Operator(token.to_string()))
+ } else if is_value(token) {
+ Ok(Token::Value(token.to_string()))
+ } else if is_identifier(token) {
+ Ok(Token::Identifier(token.to_string()))
+ } else {
+ Err("Could not parse token!")
+ }
+}
+
+fn is_operator(token: &str) -> bool {
+ for c in token.chars() {
+ if !operators.contains(&c) {
+ return false;
+ }
+ }
+ return true;
+}
+
+fn is_value(token: &str) -> bool {
+ if token == "true" || token == "false" {
+ return true;
+ }
+ // fixme: hex literals etc
+ for c in token.chars() {
+ // note size annotations are separately lexed
+ if !c.is_numeric() {
+ return false;
+ }
+ }
+ return true;
+}
+
+fn is_identifier(token: &str) -> bool {
+ if let Some(c) = token.chars().nth(0) {
+ if c.is_numeric() || c == '_' {
+ return false;
+ }
+ }
+ for c in token.chars() {
+ if !c.is_alphanumeric() && c != '_' {
+ return false;
+ }
+ }
+ return true;
+}