aboutsummaryrefslogtreecommitdiff
path: root/src/main/model/html/HtmlLexer.java
blob: 8cad425e17dfc733555bbd1bada9319a65e1706d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
package model.html;

import java.util.ArrayList;

/**
 * Splits raw HTML into a flat list of string tokens for the parser.
 *
 * <p>Two kinds of token are produced: tags (everything from {@code <} through the closing
 * {@code >}, attributes included) and the text runs found between tags. The lexer does not
 * interpret tag names or attributes, and text is passed through verbatim: no whitespace is
 * trimmed or collapsed at this stage.
 *
 * <p>Known limitation: comments and the contents of elements such as {@code <script>} get no
 * special treatment; they are scanned with the same tag/text rules as everything else.
 */
public class HtmlLexer {

    /**
     * Takes a String of raw HTML and tokenizes it for our parser.
     *
     * <p>Rules applied while scanning, one character at a time:
     * <ul>
     *   <li>An unquoted {@code <} starts a tag; any pending text (or unfinished tag) is emitted
     *       first as its own token.</li>
     *   <li>An unquoted {@code >} ends the current token, which is emitted including the
     *       {@code >}. If no tag was open, a diagnostic is printed to standard output.</li>
     *   <li>Single and double quotes are only meaningful inside a tag, where they delimit
     *       attribute values; {@code <} and {@code >} inside a quoted value do not start or end
     *       a tag. Quotes in text between tags are ordinary characters.</li>
     *   <li>Input left over at the end is emitted as a final token; an unfinished tag gets a
     *       {@code >} appended so it still looks like a tag.</li>
     * </ul>
     *
     * @param input the raw HTML to tokenize; must not be {@code null}
     * @return the tokens in document order; empty if {@code input} is empty
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public static ArrayList<String> lex(String input) {
        // StringBuilder keeps the scan linear; repeated String concatenation would be quadratic.
        StringBuilder token = new StringBuilder();
        ArrayList<String> tokens = new ArrayList<>();
        boolean inTag = false;
        boolean inSingleQuotes = false;
        boolean inDoubleQuotes = false;

        for (int pos = 0; pos < input.length(); pos++) {
            char c = input.charAt(pos);
            // Quote flags are only ever set while inside a tag, so "quoted" implies inTag.
            boolean quoted = inSingleQuotes || inDoubleQuotes;
            switch (c) {
                case '<':
                    if (!quoted) {
                        // Flush whatever was collected before this tag opener.
                        if (token.length() > 0) {
                            tokens.add(token.toString());
                            token.setLength(0);
                        }
                        inTag = true;
                    } else {
                        // '<' inside a quoted attribute value: kept in the tag token as-is.
                        System.out.printf("Probably failing parser");
                    }
                    token.append(c);
                    break;
                case '>':
                    token.append(c);
                    if (!quoted) {
                        if (!inTag) {
                            // Stray '>' in text: still terminates the current token.
                            System.out.printf("Probably failing parser");
                        }
                        inTag = false;
                        tokens.add(token.toString());
                        token.setLength(0);
                    }
                    break;
                case '"':
                    token.append(c);
                    // Outside a tag a quote is plain text (e.g. 5" tall) and must not
                    // change the lexer state.
                    if (inTag && !inSingleQuotes) {
                        inDoubleQuotes = !inDoubleQuotes;
                    }
                    break;
                case '\'':
                    token.append(c);
                    // Same as above: an apostrophe in text (e.g. don't) is not a delimiter.
                    if (inTag && !inDoubleQuotes) {
                        inSingleQuotes = !inSingleQuotes;
                    }
                    break;
                default:
                    token.append(c);
                    break;
            }
        }

        // When lexing invalid HTML we may end up with trailing garbage: either an unfinished
        // tag or extra text (those are the only two options since this is just the lex step).
        if (token.length() > 0) {
            if (inTag) {
                tokens.add(token + ">");
            } else {
                tokens.add(token.toString());
            }
        }
        return tokens;
    }
}