src/lex.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

use multipeek::multipeek;

/// Result type used by this module: the failure case is any boxed error.
pub type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;
/// A flat sequence of [`Token`]s, as returned by [`tokenize`].
pub type TokenStream = Vec<Token>;

/// **Basic** syntax tokens. Form an unambiguous TokenStream.
///
/// `Debug` and `Eq` are derived so tokens can be printed in diagnostics and
/// compared with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// An identifier.
    Word(String),
    /// Literal text, e.g. the contents of a string or comment.
    Lit(String),
    /// Punctuation: a single non-word character (newlines included).
    Sep(char),
    /// Start of an indented scope.
    Begin,
    /// End of an indented scope.
    End,
}

/// All keyword operators that may continue a line. For knowing valid line splits.
///
/// `tokenize` consults this list: when an indented line follows a line that ends
/// in one of these words, the indentation is treated as a continuation of that
/// line instead of the start of a new scope.
const valid_continuations: [&str; 7] = ["and", "or", "xor", "in", "notin", "is", "isnot"];

/// Parses whitespace-sensitive code into an unambiguous [`TokenStream`].
/// Also useful for formatting.
///
/// Leading spaces are converted into explicit [`Token::Begin`] / [`Token::End`]
/// markers. The first indented line fixes the indentation width for the whole
/// input. A dedent emits one `End` per closed level, including a dedent back to
/// column 0. Scopes still open at the end of input are not closed.
///
/// An indented line that follows a line ending in punctuation or in one of the
/// `valid_continuations` keywords is treated as a continuation of that line:
/// the newline token is dropped and the indentation is ignored.
///
/// # Errors
///
/// * `"tabs etc are not supported"` for any whitespace other than space and newline.
/// * `"indentation is offset"` when a line's indentation is not a multiple of
///   the established indentation width.
/// * `"indentation stepped by too much in one go"` when indentation grows by
///   more than one level at once.
// todo: support indentation within expressions
// nim: "As a rule of thumb, indentation within expressions is
// allowed after operators, an open parenthesis and after commas."
pub fn tokenize(input: &str) -> Result<TokenStream> {
    // The design of this lexer relies heavily on multipeek's arbitrary peeking.
    // Each token is matched by looping within its own case until complete,
    // which eliminates the need for almost all global lexer state.

    use Token::*;
    let mut start_of_line = true; // the next char is the first one on its line
    let mut indent_level: usize = 0; // current indentation, in spaces
    let mut indent_width: Option<usize> = None; // spaces per level, fixed by the first indented line
    let mut res: TokenStream = Vec::new();

    let mut input = multipeek(input.chars());
    while let Some(c) = input.next() {
        // Only a newline puts us (back) at the start of a line.
        let at_line_start = std::mem::replace(&mut start_of_line, c == '\n');
        if at_line_start && !c.is_whitespace() {
            // A line starting at column 0 closes every scope that is still open.
            apply_indentation(&mut res, &mut indent_level, &mut indent_width, 0)?;
        }

        match c {
            ' ' if at_line_start => {
                // indentation: `peek` does not advance, so consume each space as it is counted
                let mut width: usize = 1;
                while input.peek() == Some(&' ') {
                    input.next();
                    width += 1;
                }
                match input.peek().copied() {
                    // whitespace-only line, or trailing spaces at end of input
                    None | Some('\n') => (),
                    Some(_) => {
                        // If the previous line ended in punctuation or a known keyword
                        // operator, this is a line continuation: drop the newline and
                        // disregard the indentation. A blank line is not punctuation.
                        let continues_line = match res.as_slice() {
                            [.., Word(word), Sep('\n')] => {
                                valid_continuations.contains(&word.as_str())
                            }
                            [.., Sep(punct), Sep('\n')] => *punct != '\n',
                            _ => false,
                        };
                        if continues_line {
                            res.pop();
                        } else {
                            apply_indentation(
                                &mut res,
                                &mut indent_level,
                                &mut indent_width,
                                width,
                            )?;
                        }
                    }
                }
            }
            ' ' => {
                // get rid of excess (all) whitespace between tokens
                while input.peek() == Some(&' ') {
                    input.next();
                }
            }
            '\n' => res.push(Sep('\n')), // newlines are separators
            _ if c.is_whitespace() => return Err("tabs etc are not supported".into()),
            '\'' => {
                // single quoted strings, i.e. chars
                res.push(Sep('\''));
                res.push(Lit(read_quoted(&mut input, '\'')));
                res.push(Sep('\''));
            }
            '"' => {
                if input.peek_nth(0) == Some(&'"') && input.peek_nth(1) == Some(&'"') {
                    // triple quoted strings: raw text up to the next `"""`
                    input.next();
                    input.next();
                    res.extend([Sep('"'), Sep('"'), Sep('"')]);
                    let mut text = String::new();
                    while let Some(x) = input.next() {
                        // `x` is already consumed, so the two quotes that complete the
                        // closing delimiter are at peek offsets 0 and 1.
                        if x == '"'
                            && input.peek_nth(0) == Some(&'"')
                            && input.peek_nth(1) == Some(&'"')
                        {
                            input.next();
                            input.next();
                            break;
                        }
                        text.push(x);
                    }
                    res.push(Lit(text));
                    res.extend([Sep('"'), Sep('"'), Sep('"')]);
                } else {
                    // regular strings
                    res.push(Sep('"'));
                    res.push(Lit(read_quoted(&mut input, '"')));
                    res.push(Sep('"'));
                }
            }
            '#' if input.peek() == Some(&'[') => {
                // block comment, can be nested; nested delimiters are not kept in the text
                input.next();
                res.push(Sep('#'));
                res.push(Sep('['));
                let mut depth = 1;
                let mut text = String::new();
                // Check the depth *before* pulling the next char so that nothing
                // after the closing `]#` is swallowed.
                while depth > 0 {
                    match input.next() {
                        None => break,
                        Some('#') if input.peek() == Some(&'[') => {
                            depth += 1;
                            input.next();
                        }
                        Some(']') if input.peek() == Some(&'#') => {
                            depth -= 1;
                            input.next();
                        }
                        Some(x) => text.push(x),
                    }
                }
                res.push(Lit(text));
                res.push(Sep(']'));
                res.push(Sep('#'));
            }
            '#' => {
                // standard comment, runs until eol (the newline itself is left in the input)
                res.push(Sep('#'));
                let mut text = String::new();
                while let Some(&x) = input.peek() {
                    if x == '\n' {
                        break;
                    }
                    text.push(x);
                    input.next();
                }
                res.push(Lit(text));
            }
            _ if is_word_char(c) => {
                // valid identifier: starts with the char that was just consumed
                let mut word = String::from(c);
                while let Some(&x) = input.peek() {
                    if !is_word_char(x) {
                        break;
                    }
                    word.push(x);
                    input.next();
                }
                res.push(Word(word));
            }
            // punctuation; for now unknown chars are treated as Sep as well
            _ => res.push(Sep(c)),
        }
    }
    Ok(res)
}

/// Whether `c` may appear in an identifier: ASCII letters, ASCII digits and `_`.
fn is_word_char(c: char) -> bool {
    c.is_ascii_alphanumeric() || c == '_'
}

/// Reads characters up to and including `terminator` (or to the end of input) and
/// returns the text before it. A backslash is dropped and the character after it
/// is taken verbatim, so an escaped terminator does not end the literal.
fn read_quoted(input: &mut impl Iterator<Item = char>, terminator: char) -> String {
    let mut text = String::new();
    while let Some(x) = input.next() {
        match x {
            '\\' => text.extend(input.next()),
            _ if x == terminator => break,
            _ => text.push(x),
        }
    }
    text
}

/// Emits the `Begin` / `End` tokens needed to move from `*indent_level` spaces of
/// indentation to `width` spaces, then records `width` as the current level.
/// The first non-zero `width` seen becomes the indentation width.
fn apply_indentation(
    res: &mut TokenStream,
    indent_level: &mut usize,
    indent_width: &mut Option<usize>,
    width: usize,
) -> Result<()> {
    if width == *indent_level {
        return Ok(()); // same level of indentation
    }
    // Will only insert once (with a non-zero width, since the level starts at 0).
    // Allows any number of spaces per level so long as it's consistent.
    let unit = *indent_width.get_or_insert(width);
    if width % unit != 0 {
        return Err("indentation is offset".into());
    }
    if width > *indent_level {
        // new level of indentation: only one step at a time is allowed
        if width - *indent_level != unit {
            return Err("indentation stepped by too much in one go".into());
        }
        res.push(Token::Begin);
    } else {
        // old level(s) of indentation: close one scope per level
        let closed = (*indent_level - width) / unit;
        res.extend((0..closed).map(|_| Token::End));
    }
    *indent_level = width;
    Ok(())
}