aboutsummaryrefslogtreecommitdiff
path: root/src/main/model/html/HtmlParser.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/model/html/HtmlParser.java')
-rw-r--r--src/main/model/html/HtmlParser.java490
1 files changed, 288 insertions, 202 deletions
diff --git a/src/main/model/html/HtmlParser.java b/src/main/model/html/HtmlParser.java
index d6b4ff1..e9dc0c4 100644
--- a/src/main/model/html/HtmlParser.java
+++ b/src/main/model/html/HtmlParser.java
@@ -17,229 +17,317 @@ import org.javatuples.*;
*/
public class HtmlParser {
+ /**
+ * HTML is not nice to parse. We manage to get away with a relatively small number of parser states regardless.
+ */
private enum ParserState {
HTML, IGNORED,
- OPENING_TAG, KEY, VALUE,
- SINGLE_QUOTE, DOUBLE_QUOTE,
+ OPENING_TAG, KEY, VALUE, // TAG::OPENING_TAG, TAG::KEY, TAG::VALUE
+ SINGLE_QUOTES, DOUBLE_QUOTES, // VALUE::SINGLE_QUOTES, VALUE::DOUBLE_QUOTES
UNKNOWN_TAG, CLOSING_TAG,
}
- public static ArrayList<Node> parseHtmlLL(String input) {
+ // HTML documents are uniquely a list of Nodes rather than a Node themselves
+ private ArrayList<Node> result;
+ // a bunch of useful buffers. see CssParser for commentary.
+ private ArrayDeque<ElementNode> unfinished;
+ private String currentTag;
+ private ArrayList<Pair<String, String>> currentAttributes;
+ private String currentKey;
+ private String currentValue;
+ private String currentText;
+ // important for quote escapes, and multiple whitespace chars
+ private char previousChar;
- var result = new ArrayList<Node>();
- var unfinished = new ArrayDeque<ElementNode>();
- var currentTag = "";
- var currentAttributes = new ArrayList<Pair<String, String>>();
- var currentKey = "";
- var currentValue = "";
- var currentText = "";
- var previousChar = '\0'; // important for quote escapes, and multiple whitespace chars
+ private ParserState state;
+
+ public HtmlParser() {
+ result = new ArrayList<>();
+ unfinished = new ArrayDeque<>();
+ currentTag = "";
+ currentAttributes = new ArrayList<>();
+ currentKey = "";
+ currentValue = "";
+ currentText = "";
+ previousChar = '\0';
// We safely? assume to start outside of all nodes.
- ParserState state = ParserState.HTML;
+ state = ParserState.HTML;
+ }
+
+ public ArrayList<Node> parseHtml(String input) {
for (char c : input.toCharArray()) {
// System.out.print(state);
// System.out.println(" " + c + " " + currentText);
switch (state) {
- case HTML:
- switch (c) {
- case '<':
- state = ParserState.UNKNOWN_TAG;
- if (!currentText.equals("")) {
- if (unfinished.size() != 0) {
- unfinished.getLast().addChild(new TextNode(currentText));
- } else {
- result.add(new TextNode(currentText));
- }
- currentText = "";
- previousChar = '\0';
- }
- break; // FOOTGUN LANGUAGE DESIGN
- case ' ': case '\n':
- if (previousChar != ' ') {
- currentText += ' ';
- }
- previousChar = ' ';
- break;
- default:
- currentText += c;
- previousChar = c;
- break;
- }
+ case HTML: caseHtml(c);
break;
- case UNKNOWN_TAG:
- switch (c) {
- case '/':
- state = ParserState.CLOSING_TAG;
- break;
- case '>': // Why would you put <> in your HTML??? go away
- state = ParserState.HTML;
- currentText += "<>";
- System.out.println("Why would you put <> in your HTML??? go away");
- break;
- // For now, we'll straight-up ignore anything matching the <!...> syntax:
- // i.e. comments, and <!DOCTYPE html>
- case '!':
- state = ParserState.IGNORED;
- break;
- default:
- state = ParserState.OPENING_TAG;
- currentTag += c;
- break;
- }
+ case UNKNOWN_TAG: caseUnknownTag(c);
break; // FOOTGUN LANGUAGE DESIGN STRIKES AGAIN
- case IGNORED:
- switch (c) {
- case '>':
- state = ParserState.HTML;
- break;
- default:
- break;
- }
+ case IGNORED: caseIgnored(c);
break;
- case OPENING_TAG:
- switch (c) {
- case '>':
- state = ParserState.HTML;
- var node = new ElementNode(currentTag, currentAttributes);
- if (unfinished.size() != 0) {
- unfinished.getLast().addChild(node);
- unfinished.add(node);
- } else {
- result.add(node);
- unfinished.add((ElementNode) result.get(result.size() - 1));
- }
- currentTag = "";
- currentAttributes = new ArrayList<>();
- break;
- case ' ': case '\n':
- state = ParserState.KEY;
- break;
- default:
- currentTag += c;
- break;
- }
+ case OPENING_TAG: caseOpeningTag(c);
break;
- case CLOSING_TAG:
- switch (c) {
- case '>':
- state = ParserState.HTML;
- // IMPORTANT: we don't validate that closing tags correspond to an open tag
- if (!isSelfClosingTag(currentTag)) {
- if (unfinished.size() != 0) {
- unfinished.removeLast();
- }
- }
- currentTag = "";
- break;
- case ' ': case '\n':
- break;
- default:
- currentTag += c;
- break;
- }
+ case CLOSING_TAG: caseClosingTag(c);
break;
- case KEY:
- switch (c) {
- case '>':
- state = ParserState.HTML;
- var node = new ElementNode(currentTag, currentAttributes);
- if (unfinished.size() != 0) {
- unfinished.getLast().addChild(node);
- unfinished.add(node);
- } else {
- result.add(node);
- unfinished.add((ElementNode) result.get(result.size() - 1));
- }
- currentTag = "";
- currentAttributes = new ArrayList<>();
- break;
- case '=':
- state = ParserState.VALUE;
- break;
- case ' ': case '\n':
- break;
- default:
- currentKey += c;
- break;
- }
+ case KEY: caseKey(c);
break;
- case VALUE:
- switch (c) {
- case '\'':
- state = ParserState.SINGLE_QUOTE;
- break;
- case '\"':
- state = ParserState.DOUBLE_QUOTE;
- break;
- case ' ': case '\n':
- currentAttributes.add(new Pair<>(currentKey, currentValue));
- currentKey = "";
- currentValue = "";
- case '>':
- if (!currentKey.equals("") || !currentValue.equals("")) {
- currentAttributes.add(new Pair<>(currentKey, currentValue));
- currentKey = "";
- currentValue = "";
- }
- state = ParserState.HTML;
- var node = new ElementNode(currentTag, currentAttributes);
- if (unfinished.size() != 0) {
- unfinished.getLast().addChild(node);
- unfinished.add(node);
- } else {
- result.add(node);
- unfinished.add((ElementNode) result.get(result.size() - 1));
- }
- currentTag = "";
- currentAttributes = new ArrayList<>();
- break;
- default:
- currentValue += c;
- break;
- }
+ case VALUE: caseValue(c);
break;
- case SINGLE_QUOTE:
- switch (c) {
- case '\'':
- if (previousChar != '\\') {
- state = ParserState.VALUE;
- previousChar = '\0';
- } else {
- currentValue = currentValue.substring(0, currentValue.length() - 2);
- currentValue += c;
- previousChar = c;
- }
- break;
- default:
- currentValue += c;
- previousChar = c;
- break;
- }
+ case SINGLE_QUOTES: caseSingleQuotes(c);
break;
- case DOUBLE_QUOTE:
- switch (c) {
- case '\"':
- if (previousChar != '\\') {
- state = ParserState.VALUE;
- previousChar = '\0';
- } else {
- currentValue = currentValue.substring(0, currentValue.length() - 2);
- currentValue += c;
- previousChar = c;
- }
- default:
- currentValue += c;
- previousChar = c;
- break;
- }
+ case DOUBLE_QUOTES: caseDoubleQuotes(c);
break;
}
}
return result;
}
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the HTML state.
+ * MODIFIES: this
+ */
+ private void caseHtml(char c) {
+ switch (c) {
+ case '<':
+ state = ParserState.UNKNOWN_TAG;
+ if (!currentText.equals("")) {
+ addNewTextNode();
+ }
+ break; // FOOTGUN LANGUAGE DESIGN
+ case ' ': case '\n':
+ if (previousChar != ' ') {
+ currentText += ' ';
+ }
+ previousChar = ' ';
+ break;
+ default:
+ currentText += c;
+ previousChar = c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the UNKNOWN_TAG state.
+ * MODIFIES: this
+ */
+ private void caseUnknownTag(char c) {
+ switch (c) {
+ case '/':
+ state = ParserState.CLOSING_TAG;
+ break;
+ case '>': // Why would you put <> in your HTML??? go away
+ state = ParserState.HTML;
+ currentText += "<>";
+ break;
+ // For now, we'll straight-up ignore anything matching the <!...> syntax:
+ // i.e. comments, and <!DOCTYPE html>
+ case '!':
+ state = ParserState.IGNORED;
+ break;
+ default:
+ state = ParserState.OPENING_TAG;
+ currentTag += c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the IGNORED state.
+ * MODIFIES: this
+ */
+ private void caseIgnored(char c) {
+ switch (c) {
+ case '>':
+ state = ParserState.HTML;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the OPENING_TAG state.
+ * MODIFIES: this
+ */
+ private void caseOpeningTag(char c) {
+ switch (c) {
+ case '>':
+ addNewElementNode();
+ break;
+ case ' ': case '\n':
+ state = ParserState.KEY;
+ break;
+ default:
+ currentTag += c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the CLOSING_TAG state.
+ * MODIFIES: this
+ */
+ private void caseClosingTag(char c) {
+ switch (c) {
+ case '>':
+ state = ParserState.HTML;
+ // IMPORTANT: we don't validate that closing tags correspond to an open tag
+ if (!isSelfClosingTag(currentTag)) {
+ if (unfinished.size() != 0) {
+ unfinished.removeLast();
+ }
+ }
+ currentTag = "";
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentTag += c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the KEY state.
+ * MODIFIES: this
+ */
+ private void caseKey(char c) {
+ switch (c) {
+ case '>':
+ addNewElementNode();
+ break;
+ case '=':
+ state = ParserState.VALUE;
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentKey += c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the VALUE state.
+ * MODIFIES: this
+ */
+ private void caseValue(char c) {
+ switch (c) {
+ case '\'':
+ state = ParserState.SINGLE_QUOTES;
+ break;
+ case '\"':
+ state = ParserState.DOUBLE_QUOTES;
+ break;
+ case ' ': case '\n':
+ currentAttributes.add(new Pair<>(currentKey, currentValue));
+ currentKey = "";
+ currentValue = "";
+ case '>':
+ if (!currentKey.equals("") || !currentValue.equals("")) {
+ currentAttributes.add(new Pair<>(currentKey, currentValue));
+ currentKey = "";
+ currentValue = "";
+ }
+ addNewElementNode();
+ break;
+ default:
+ currentValue += c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the SINGLE_QUOTES state.
+ * MODIFIES: this
+ */
+ private void caseSingleQuotes(char c) {
+ switch (c) {
+ case '\'':
+ if (previousChar != '\\') {
+ state = ParserState.VALUE;
+ previousChar = '\0';
+ } else {
+ currentValue = currentValue.substring(0, currentValue.length() - 2);
+ currentValue += c;
+ previousChar = c;
+ }
+ break;
+ default:
+ currentValue += c;
+ previousChar = c;
+ break;
+ }
+ }
+
+ /**
+ * EFFECTS: Handles and updates parser state/buffers for a single character while in the DOUBLE_QUOTES state.
+ * MODIFIES: this
+ */
+ private void caseDoubleQuotes(char c) {
+ switch (c) {
+ case '\"':
+ if (previousChar != '\\') {
+ state = ParserState.VALUE;
+ previousChar = '\0';
+ } else {
+ currentValue = currentValue.substring(0, currentValue.length() - 2);
+ currentValue += c;
+ previousChar = c;
+ }
+ break; // FOOTGUN LANGUAGE DESIGN
+ default:
+ currentValue += c;
+ previousChar = c;
+ break;
+ }
+ }
+
+ /**
+ * Helper function to remove code duplication.
+ * EFFECTS: Creates and adds a new ElementNode from the current buffers to the unfinished and result stacks
+ * MODIFIES: this
+ */
+ private void addNewElementNode() {
+ state = ParserState.HTML;
+ var node = new ElementNode(currentTag, currentAttributes);
+ if (unfinished.size() != 0) {
+ unfinished.getLast().addChild(node);
+ if (!isSelfClosingTag(currentTag)) {
+ unfinished.add(node);
+ }
+ } else {
+ result.add(node);
+ if (!isSelfClosingTag(currentTag)) {
+ unfinished.add((ElementNode) result.get(result.size() - 1));
+ }
+ }
+ currentTag = "";
+ currentAttributes = new ArrayList<>();
+ }
+
+ /**
+ * Helper function to check method length boxes.
+ * EFFECTS: Creates and adds a new TextNode from the current buffers to the unfinished and result stacks
+ * MODIFIES: this
+ */
+ private void addNewTextNode() {
+ if (unfinished.size() != 0) {
+ unfinished.getLast().addChild(new TextNode(currentText));
+ } else {
+ result.add(new TextNode(currentText));
+ }
+ currentText = "";
+ previousChar = '\0';
+ }
+
+ /**
+ * Simple helper function to check if a tag is self-closing.
+ * EFFECTS: Returns whether a String tag is a self-closing tag.
+ */
private static boolean isSelfClosingTag(String tag) {
switch (tag) {
case "input": case "param":
@@ -281,7 +369,7 @@ public class HtmlParser {
<div id="intro">
<img id="face" src="assets/compass.jpg"/>
</div>
- <!-- <div id="details">
+ <div id="details">
<h2>Projects</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
@@ -291,7 +379,7 @@ public class HtmlParser {
dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa
qui officia deserunt mollit anim id est laborum. </p>
- <h2>Posts</h2>
+ <!-- <h2>Posts</h2>
<p> Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
@@ -299,14 +387,12 @@ public class HtmlParser {
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa
- qui officia deserunt mollit anim id est laborum. </p>
- </div> -->
+ qui officia deserunt mollit anim id est laborum. </p> -->
+ </div>
</main>
<footer>
<span><img src="assets/copyleft.svg" width="12" height="12"/> 2020-2022 j-james </span>
</footer>
</body>
</html>
-<!--
-
*/