aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJJ2023-10-26 10:34:03 +0000
committerJJ2023-10-26 10:45:32 +0000
commit3a25fa06d2cd9d2b89b68622415d7168cf06c0da (patch)
tree82d614370c6673275887401d398d5ed59321b056 /src
parent7473fb09ed89e5e2fde8a7d6e89bc5d28151bdae (diff)
compiler: unicode support, reconsider indentation handling
Diffstat (limited to 'src')
-rw-r--r--src/lex.rs101
1 file changed, 35 insertions, 66 deletions
diff --git a/src/lex.rs b/src/lex.rs
index d3d5a7e..7e31476 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -8,6 +8,7 @@ pub enum LexicalError {
InvalidIndentation,
MismatchedParens,
MismatchedBrackets,
+ UnknownPunctuation,
}
impl std::fmt::Display for LexicalError {
@@ -24,7 +25,7 @@ pub enum Token {
Num(String), // numeric value, ex. 413, 0b101011, 0xabcd
Lit(Literal), // literal value, ex. for strings/comments.
Sep(Punctuation), // punctuation. non-word tokens. operators are lexed as this and later transformed to words.
- Begin, End, Newline // scope indicators. can i use trees instead? should i use trees instead?
+ Indent(usize), // indentation. denotes line breaks and scope at which a line starts.
}
#[derive(Clone, PartialEq)]
@@ -93,16 +94,12 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
enum Bracket { Generic, Array }
struct State {
start_of_line: bool,
- indent_level: isize,
- indent_width: isize,
paren_stack: Vec<Paren>,
bracket_stack: Vec<Bracket>,
}
let mut state = State {
start_of_line: true,
- indent_level: 0,
- indent_width: 0,
paren_stack: vec!(),
bracket_stack: vec!(),
};
@@ -114,66 +111,35 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
let mut input = multipeek(input.chars());
while let Some(c) = input.next() {
match c {
- ' ' => {
- if state.start_of_line { // indentation
- let mut current_indent_level = 1;
- while let Some(x) = input.peek() {
- match x {
- ' ' => current_indent_level += 1,
- '\n' => break, // empty line
- _ => { // indentation ends
- // really gross. this just checks if the previous token was a newline,
- // and that the token before it was punctuation or a known "operator",
- // and if so disregards indentation and treats it as a line continuation.
- if let Some(Newline) = res.get(res.len() - 1) {
- if let Some(prev) = res.get(res.len() - 2) {
- match prev { // all keywords and punctuation that may continue a line
- // workaround for https://github.com/rust-lang/rust/issues/87121
- Word(a) if a == "==" || a == "and" || a == "or" ||
- a == "xor" || a == "in" || a == "is" => {
- res.pop();
- break;
- },
- &Sep(FuncLeftParen) | Sep(GenericLeftBracket) | Sep(StructLeftBrace) |
- Sep(TupleLeftParen) | Sep(ArrayLeftBracket) | Sep(Comma) => {
- res.pop();
- break;
- }
- _ => {}
- }
+ ' ' => { // indentation! and whitespace
+ match res.last() {
+ Some(Indent(_)) => { // indentation!
+ res.pop(); // discard previous empty or useless Indent token
+ let mut current_indent_level = 1;
+ while let Some(x) = input.peek() {
+ match x {
+ ' ' => current_indent_level += 1,
+ _ => match res.last() { // indentation ends
+ Some(Word(a)) if a == "==" || a == "and" || a == "or" ||
+ a == "xor" || a == "in" || a == "is" => break,
+ Some(Sep(FuncLeftParen)) | Some(Sep(TupleLeftParen)) |
+ Some(Sep(GenericLeftBracket)) | Some(Sep(ArrayLeftBracket)) |
+ Some(Sep(StructLeftBrace)) | Some(Sep(Comma)) => break,
+ _ => {
+ res.push(Indent(current_indent_level));
+ break;
}
}
-
- // will only fire once. allows us to support X number of spaces so long as it's consistent
- if state.indent_width == 0 {
- state.indent_width = current_indent_level;
- }
-
- if current_indent_level % state.indent_width != 0 {
- return Err(InvalidIndentation.into());
- }
-
- let diff = (current_indent_level - state.indent_level) / state.indent_width;
- match diff {
- 0 => (), // same level of indentation
- 1 => res.push(Begin), // new level of indentation
- -1 => res.push(End), // old level of indentation
- _ => return Err(InvalidIndentation.into()) // todo: support indentation in exprs
- }
- state.indent_level = current_indent_level;
- break;
}
}
+ },
+ _ => { // get rid of excess (all) whitespace between words/operators
+ while input.peek().is_some_and(|x| x.is_whitespace() && x != &'\n') { input.next(); }
}
- } else { // get rid of excess (all) whitespace
- while input.peek() == Some(&' ') { input.next(); }
}
},
'\t' => return Err(InvalidIndentation.into()),
- '\n' => {
- state.start_of_line = true;
- res.push(Newline)
- },
+ '\n' => res.push(Indent(0)),
'\'' => { // chars!
while let Some(x) = input.next() {
match x {
@@ -190,8 +156,8 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
input.next(); input.next();
while let Some(x) = input.next() {
match x {
- '"' if input.peek_nth(1) == Some(&'"') &&
- input.peek_nth(2) == Some(&'"') => {
+ '"' if input.peek_nth(0) == Some(&'"') &&
+ input.peek_nth(1) == Some(&'"') => {
input.next(); input.next();
break;
},
@@ -257,13 +223,11 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
}
}
},
- 'a'..='z' | 'A'..='Z' | '_' => { // valid identifiers!
- buf.push(c); // todo: unicode support
+ c if c.is_alphabetic() || c == '_' => { // valid identifiers!
+ buf.push(c);
while let Some(x) = input.next() {
match x {
- 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
- buf.push(x);
- },
+ x if x.is_alphanumeric() || x == '_' => buf.push(x),
_ => {
res.push(Word(String::from(&buf)));
match x { // () and [] denote both parameters/generics and tuples/arrays
@@ -285,7 +249,7 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
'0'..='9' => { // numeric literals!
buf.push(c);
while let Some(x) = input.next() {
- match x { // todo: unicode support
+ match x {
'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
buf.push(x);
input.next();
@@ -322,6 +286,11 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
Some(Bracket::Array) => res.push(Sep(ArrayRightBracket)),
None => return Err(MismatchedBrackets.into()),
}
+ if input.peek() == Some(&'[') { // parameters following generics
+ res.push(Sep(FuncLeftParen));
+ state.paren_stack.push(Paren::Func);
+ input.next();
+ }
},
',' => res.push(Sep(Comma)),
'.' => res.push(Sep(Period)),
@@ -345,7 +314,7 @@ pub fn tokenize(input: &str) -> Result<TokenStream> {
'?' => res.push(Sep(Question)),
'^' => res.push(Sep(Caret)),
'\\' => res.push(Sep(Backslash)),
- _ => return Err("unknown character".into()) // todo: support unicode!
+ _ => return Err(UnknownPunctuation.into())
}
buf.clear();
}