package model.html;
import java.util.ArrayList;
/**
 * Tokenizes HTML by tags; the contents of a tag and its attributes are not interpreted at this
 * stage. The input is treated as free-form: duplicated whitespace carries no meaning.
 */
public class HtmlLexer {
// Takes a String of raw HTML, and tokenizes it for our parser.
public static ArrayList lex(String input) {
String token = "";
ArrayList tokens = new ArrayList<>();
boolean inTag = false;
boolean inSingleQuotes = false;
boolean inDoubleQuotes = false;
for (char i : input.toCharArray()) {
token += i;
switch (i) {
case '<':
if (!inSingleQuotes && !inDoubleQuotes) {
inTag = true;
if (!token.equals("<")) {
tokens.add(token.substring(0, token.length() - 1));
token = "<";
}
} else if (inTag) {
System.out.printf("Probably failing parser");
}
break;
case '>':
if (!inSingleQuotes && !inDoubleQuotes) {
if (!inTag) {
System.out.printf("Probably failing parser");
}
inTag = false;
tokens.add(token);
token = "";
}
break;
case '"':
if (!inSingleQuotes) {
inDoubleQuotes = !inDoubleQuotes;
}
break;
case '\'':
if (!inDoubleQuotes) {
inSingleQuotes = !inSingleQuotes;
}
break;
}
}
/**
* When lexing invalid HTML: we may end up with trailing garbage: either an unfinished tag or extra text
* (those are the only two options since this is just the lex step)
*/
if (!token.equals("")) {
if (inTag) {
tokens.add(token + ">");
} else {
tokens.add(token);
}
}
return tokens;
}
}