aboutsummaryrefslogtreecommitdiff
path: root/src/main/model/html/HtmlLexer.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/model/html/HtmlLexer.java')
-rw-r--r--src/main/model/html/HtmlLexer.java68
1 files changed, 68 insertions, 0 deletions
diff --git a/src/main/model/html/HtmlLexer.java b/src/main/model/html/HtmlLexer.java
new file mode 100644
index 0000000..8cad425
--- /dev/null
+++ b/src/main/model/html/HtmlLexer.java
@@ -0,0 +1,68 @@
+package model.html;
+
+import java.util.ArrayList;
+
+/**
+ * We'll tokenize HTML by tags: disregarding the contents of the tag and attributes within the tag.
+ * The file is also considered to be free-form here: whitespace duplicates are disregarded.
+ */
+public class HtmlLexer {
+
+ // Takes a String of raw HTML, and tokenizes it for our parser.
+ public static ArrayList<String> lex(String input) {
+ String token = "";
+ ArrayList<String> tokens = new ArrayList<>();
+ boolean inTag = false;
+ boolean inSingleQuotes = false;
+ boolean inDoubleQuotes = false;
+
+ for (char i : input.toCharArray()) {
+ token += i;
+ switch (i) {
+ case '<':
+ if (!inSingleQuotes && !inDoubleQuotes) {
+ inTag = true;
+ if (!token.equals("<")) {
+ tokens.add(token.substring(0, token.length() - 1));
+ token = "<";
+ }
+ } else if (inTag) {
+ System.out.printf("Probably failing parser");
+ }
+ break;
+ case '>':
+ if (!inSingleQuotes && !inDoubleQuotes) {
+ if (!inTag) {
+ System.out.printf("Probably failing parser");
+ }
+ inTag = false;
+ tokens.add(token);
+ token = "";
+ }
+ break;
+ case '"':
+ if (!inSingleQuotes) {
+ inDoubleQuotes = !inDoubleQuotes;
+ }
+ break;
+ case '\'':
+ if (!inDoubleQuotes) {
+ inSingleQuotes = !inSingleQuotes;
+ }
+ break;
+ }
+ }
+ /**
+ * When lexing invalid HTML: we may end up with trailing garbage: either an unfinished tag or extra text
+ * (those are the only two options since this is just the lex step)
+ */
+ if (!token.equals("")) {
+ if (inTag) {
+ tokens.add(token + ">");
+ } else {
+ tokens.add(token);
+ }
+ }
+ return tokens;
+ }
+}