diff options
Diffstat (limited to 'src/main/model/html/HtmlLexer.java')
-rw-r--r-- | src/main/model/html/HtmlLexer.java | 68 |
1 files changed, 68 insertions, 0 deletions
/**
 * Splits raw HTML into a flat sequence of tokens for the parser.
 *
 * <p>A token is either a complete tag (everything from {@code <} up to and including the
 * matching {@code >}, attributes included) or the run of text found between two tags.
 * Characters are copied into tokens verbatim: this class does not trim or collapse
 * whitespace, so any free-form whitespace handling is left to the consumer of the tokens.
 */
public class HtmlLexer {

    // Diagnostic emitted on standard output when the input looks malformed.
    private static final String MALFORMED_WARNING = "Probably failing parser";

    /**
     * Tokenizes a string of raw HTML.
     *
     * <p>Single and double quotes are only meaningful inside a tag, where they delimit
     * attribute values: a {@code <} or {@code >} inside a quoted attribute value does not
     * start or end a tag. Quotes appearing in plain text (for example the apostrophe in
     * {@code don't}) are ordinary characters and do not affect tag detection.
     *
     * <p>Malformed input is tolerated rather than rejected:
     * <ul>
     *   <li>a {@code <} met while a tag is still open flushes the unfinished tag as its own
     *       token (without a closing {@code >}) and starts a new tag; a warning is printed;</li>
     *   <li>a {@code >} met outside any tag ends the current text token; a warning is
     *       printed;</li>
     *   <li>a tag left unfinished at the end of the input gets a {@code >} appended;</li>
     *   <li>trailing text is emitted as a final text token.</li>
     * </ul>
     *
     * @param input the raw HTML; {@code null} is treated like the empty string
     * @return the tokens in document order; empty when there is nothing to tokenize
     */
    public static ArrayList<String> lex(String input) {
        ArrayList<String> tokens = new ArrayList<>();
        if (input == null || input.isEmpty()) {
            return tokens;
        }

        // StringBuilder avoids re-copying the pending token on every appended character.
        StringBuilder token = new StringBuilder();
        boolean inTag = false;
        boolean inSingleQuotes = false;
        boolean inDoubleQuotes = false;

        for (int idx = 0; idx < input.length(); idx++) {
            char c = input.charAt(idx);
            token.append(c);
            boolean inQuotes = inSingleQuotes || inDoubleQuotes;

            switch (c) {
                case '<':
                    if (!inQuotes) {
                        if (inTag) {
                            // The previous tag was never closed before this one started.
                            System.out.println(MALFORMED_WARNING);
                        }
                        inTag = true;
                        if (token.length() > 1) {
                            // Everything gathered before this '<' is a token of its own.
                            tokens.add(token.substring(0, token.length() - 1));
                            token.setLength(0);
                            token.append('<');
                        }
                    }
                    break;
                case '>':
                    if (!inQuotes) {
                        if (!inTag) {
                            // A closing bracket with no tag open.
                            System.out.println(MALFORMED_WARNING);
                        }
                        inTag = false;
                        tokens.add(token.toString());
                        token.setLength(0);
                    }
                    break;
                case '"':
                    // Quotes delimit attribute values only; in text they are plain characters.
                    if (inTag && !inSingleQuotes) {
                        inDoubleQuotes = !inDoubleQuotes;
                    }
                    break;
                case '\'':
                    if (inTag && !inDoubleQuotes) {
                        inSingleQuotes = !inSingleQuotes;
                    }
                    break;
                default:
                    break;
            }
        }

        // Invalid HTML may leave trailing content behind: either an unfinished tag or extra
        // text. Those are the only two possibilities at the lexing stage.
        if (token.length() > 0) {
            if (inTag) {
                tokens.add(token + ">");
            } else {
                tokens.add(token.toString());
            }
        }
        return tokens;
    }
}