diff options
Diffstat (limited to 'src/main/model/html/HtmlParser.java')
-rw-r--r-- | src/main/model/html/HtmlParser.java | 490 |
1 files changed, 288 insertions, 202 deletions
diff --git a/src/main/model/html/HtmlParser.java b/src/main/model/html/HtmlParser.java index d6b4ff1..e9dc0c4 100644 --- a/src/main/model/html/HtmlParser.java +++ b/src/main/model/html/HtmlParser.java @@ -17,229 +17,317 @@ import org.javatuples.*; */ public class HtmlParser { + /** + * HTML is not nice to parse. We manage to get away with a relatively small number of parser states regardless. + */ private enum ParserState { HTML, IGNORED, - OPENING_TAG, KEY, VALUE, - SINGLE_QUOTE, DOUBLE_QUOTE, + OPENING_TAG, KEY, VALUE, // TAG::OPENING_TAG, TAG::KEY, TAG::VALUE + SINGLE_QUOTES, DOUBLE_QUOTES, // VALUE::SINGLE_QUOTES, VALUE::DOUBLE_QUOTES UNKNOWN_TAG, CLOSING_TAG, } - public static ArrayList<Node> parseHtmlLL(String input) { + // HTML documents are uniquely a list of Nodes rather than a Node themselves + private ArrayList<Node> result; + // a bunch of useful buffers. see CssParser for commentary. + private ArrayDeque<ElementNode> unfinished; + private String currentTag; + private ArrayList<Pair<String, String>> currentAttributes; + private String currentKey; + private String currentValue; + private String currentText; + // important for quote escapes, and multiple whitespace chars + private char previousChar; - var result = new ArrayList<Node>(); - var unfinished = new ArrayDeque<ElementNode>(); - var currentTag = ""; - var currentAttributes = new ArrayList<Pair<String, String>>(); - var currentKey = ""; - var currentValue = ""; - var currentText = ""; - var previousChar = '\0'; // important for quote escapes, and multiple whitespace chars + private ParserState state; + + public HtmlParser() { + result = new ArrayList<>(); + unfinished = new ArrayDeque<>(); + currentTag = ""; + currentAttributes = new ArrayList<>(); + currentKey = ""; + currentValue = ""; + currentText = ""; + previousChar = '\0'; // We safely? assume to start outside of all nodes. - ParserState state = ParserState.HTML; + state = ParserState.HTML; + } + + public ArrayList<Node> parseHtml(String input) { for (char c : input.toCharArray()) { // System.out.print(state); // System.out.println(" " + c + " " + currentText); switch (state) { - case HTML: - switch (c) { - case '<': - state = ParserState.UNKNOWN_TAG; - if (!currentText.equals("")) { - if (unfinished.size() != 0) { - unfinished.getLast().addChild(new TextNode(currentText)); - } else { - result.add(new TextNode(currentText)); - } - currentText = ""; - previousChar = '\0'; - } - break; // FOOTGUN LANGUAGE DESIGN - case ' ': case '\n': - if (previousChar != ' ') { - currentText += ' '; - } - previousChar = ' '; - break; - default: - currentText += c; - previousChar = c; - break; - } + case HTML: caseHtml(c); break; - case UNKNOWN_TAG: - switch (c) { - case '/': - state = ParserState.CLOSING_TAG; - break; - case '>': // Why would you put <> in your HTML??? go away - state = ParserState.HTML; - currentText += "<>"; - System.out.println("Why would you put <> in your HTML??? go away"); - break; - // For now, we'll straight-up ignore anything matching the <!...> syntax: - // i.e. comments, and <!DOCTYPE html> - case '!': - state = ParserState.IGNORED; - break; - default: - state = ParserState.OPENING_TAG; - currentTag += c; - break; - } + case UNKNOWN_TAG: caseUnknownTag(c); break; // FOOTGUN LANGUAGE DESIGN STRIKES AGAIN - case IGNORED: - switch (c) { - case '>': - state = ParserState.HTML; - break; - default: - break; - } + case IGNORED: caseIgnored(c); break; - case OPENING_TAG: - switch (c) { - case '>': - state = ParserState.HTML; - var node = new ElementNode(currentTag, currentAttributes); - if (unfinished.size() != 0) { - unfinished.getLast().addChild(node); - unfinished.add(node); - } else { - result.add(node); - unfinished.add((ElementNode) result.get(result.size() - 1)); - } - currentTag = ""; - currentAttributes = new ArrayList<>(); - break; - case ' ': case '\n': - state = ParserState.KEY; - break; - default: - currentTag += c; - break; - } + case OPENING_TAG: caseOpeningTag(c); break; - case CLOSING_TAG: - switch (c) { - case '>': - state = ParserState.HTML; - // IMPORTANT: we don't validate that closing tags correspond to an open tag - if (!isSelfClosingTag(currentTag)) { - if (unfinished.size() != 0) { - unfinished.removeLast(); - } - } - currentTag = ""; - break; - case ' ': case '\n': - break; - default: - currentTag += c; - break; - } + case CLOSING_TAG: caseClosingTag(c); break; - case KEY: - switch (c) { - case '>': - state = ParserState.HTML; - var node = new ElementNode(currentTag, currentAttributes); - if (unfinished.size() != 0) { - unfinished.getLast().addChild(node); - unfinished.add(node); - } else { - result.add(node); - unfinished.add((ElementNode) result.get(result.size() - 1)); - } - currentTag = ""; - currentAttributes = new ArrayList<>(); - break; - case '=': - state = ParserState.VALUE; - break; - case ' ': case '\n': - break; - default: - currentKey += c; - break; - } + case KEY: caseKey(c); break; - case VALUE: - switch (c) { - case '\'': - state = ParserState.SINGLE_QUOTE; - break; - case '\"': - state = ParserState.DOUBLE_QUOTE; - break; - case ' ': case '\n': - currentAttributes.add(new Pair<>(currentKey, currentValue)); - currentKey = ""; - currentValue = ""; - case '>': - if (!currentKey.equals("") || !currentValue.equals("")) { - currentAttributes.add(new Pair<>(currentKey, currentValue)); - currentKey = ""; - currentValue = ""; - } - state = ParserState.HTML; - var node = new ElementNode(currentTag, currentAttributes); - if (unfinished.size() != 0) { - unfinished.getLast().addChild(node); - unfinished.add(node); - } else { - result.add(node); - unfinished.add((ElementNode) result.get(result.size() - 1)); - } - currentTag = ""; - currentAttributes = new ArrayList<>(); - break; - default: - currentValue += c; - break; - } + case VALUE: caseValue(c); break; - case SINGLE_QUOTE: - switch (c) { - case '\'': - if (previousChar != '\\') { - state = ParserState.VALUE; - previousChar = '\0'; - } else { - currentValue = currentValue.substring(0, currentValue.length() - 2); - currentValue += c; - previousChar = c; - } - break; - default: - currentValue += c; - previousChar = c; - break; - } + case SINGLE_QUOTES: caseSingleQuotes(c); break; - case DOUBLE_QUOTE: - switch (c) { - case '\"': - if (previousChar != '\\') { - state = ParserState.VALUE; - previousChar = '\0'; - } else { - currentValue = currentValue.substring(0, currentValue.length() - 2); - currentValue += c; - previousChar = c; - } - default: - currentValue += c; - previousChar = c; - break; - } + case DOUBLE_QUOTES: caseDoubleQuotes(c); break; } } return result; } + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the HTML state. + * MODIFIES: this + */ + private void caseHtml(char c) { + switch (c) { + case '<': + state = ParserState.UNKNOWN_TAG; + if (!currentText.equals("")) { + addNewTextNode(); + } + break; // FOOTGUN LANGUAGE DESIGN + case ' ': case '\n': + if (previousChar != ' ') { + currentText += ' '; + } + previousChar = ' '; + break; + default: + currentText += c; + previousChar = c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the UNKNOWN_TAG state. + * MODIFIES: this + */ + private void caseUnknownTag(char c) { + switch (c) { + case '/': + state = ParserState.CLOSING_TAG; + break; + case '>': // Why would you put <> in your HTML??? go away + state = ParserState.HTML; + currentText += "<>"; + break; + // For now, we'll straight-up ignore anything matching the <!...> syntax: + // i.e. comments, and <!DOCTYPE html> + case '!': + state = ParserState.IGNORED; + break; + default: + state = ParserState.OPENING_TAG; + currentTag += c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the IGNORED state. + * MODIFIES: this + */ + private void caseIgnored(char c) { + switch (c) { + case '>': + state = ParserState.HTML; + break; + default: + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the OPENING_TAG state. + * MODIFIES: this + */ + private void caseOpeningTag(char c) { + switch (c) { + case '>': + addNewElementNode(); + break; + case ' ': case '\n': + state = ParserState.KEY; + break; + default: + currentTag += c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the CLOSING_TAG state. + * MODIFIES: this + */ + private void caseClosingTag(char c) { + switch (c) { + case '>': + state = ParserState.HTML; + // IMPORTANT: we don't validate that closing tags correspond to an open tag + if (!isSelfClosingTag(currentTag)) { + if (unfinished.size() != 0) { + unfinished.removeLast(); + } + } + currentTag = ""; + break; + case ' ': case '\n': + break; + default: + currentTag += c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the KEY state. + * MODIFIES: this + */ + private void caseKey(char c) { + switch (c) { + case '>': + addNewElementNode(); + break; + case '=': + state = ParserState.VALUE; + break; + case ' ': case '\n': + break; + default: + currentKey += c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the VALUE state. + * MODIFIES: this + */ + private void caseValue(char c) { + switch (c) { + case '\'': + state = ParserState.SINGLE_QUOTES; + break; + case '\"': + state = ParserState.DOUBLE_QUOTES; + break; + case ' ': case '\n': + currentAttributes.add(new Pair<>(currentKey, currentValue)); + currentKey = ""; + currentValue = ""; + case '>': + if (!currentKey.equals("") || !currentValue.equals("")) { + currentAttributes.add(new Pair<>(currentKey, currentValue)); + currentKey = ""; + currentValue = ""; + } + addNewElementNode(); + break; + default: + currentValue += c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the SINGLE_QUOTES state. + * MODIFIES: this + */ + private void caseSingleQuotes(char c) { + switch (c) { + case '\'': + if (previousChar != '\\') { + state = ParserState.VALUE; + previousChar = '\0'; + } else { + currentValue = currentValue.substring(0, currentValue.length() - 2); + currentValue += c; + previousChar = c; + } + break; + default: + currentValue += c; + previousChar = c; + break; + } + } + + /** + * EFFECTS: Handles and updates parser state/buffers for a single character while in the DOUBLE_QUOTES state. + * MODIFIES: this + */ + private void caseDoubleQuotes(char c) { + switch (c) { + case '\"': + if (previousChar != '\\') { + state = ParserState.VALUE; + previousChar = '\0'; + } else { + currentValue = currentValue.substring(0, currentValue.length() - 2); + currentValue += c; + previousChar = c; + } + break; // FOOTGUN LANGUAGE DESIGN + default: + currentValue += c; + previousChar = c; + break; + } + } + + /** + * Helper function to remove code duplication. + * EFFECTS: Creates and adds a new ElementNode from the current buffers to the unfinished and result stacks + * MODIFIES: this + */ + private void addNewElementNode() { + state = ParserState.HTML; + var node = new ElementNode(currentTag, currentAttributes); + if (unfinished.size() != 0) { + unfinished.getLast().addChild(node); + if (!isSelfClosingTag(currentTag)) { + unfinished.add(node); + } + } else { + result.add(node); + if (!isSelfClosingTag(currentTag)) { + unfinished.add((ElementNode) result.get(result.size() - 1)); + } + } + currentTag = ""; + currentAttributes = new ArrayList<>(); + } + + /** + * Helper function to check method length boxes. + * EFFECTS: Creates and adds a new TextNode from the current buffers to the unfinished and result stacks + * MODIFIES: this + */ + private void addNewTextNode() { + if (unfinished.size() != 0) { + unfinished.getLast().addChild(new TextNode(currentText)); + } else { + result.add(new TextNode(currentText)); + } + currentText = ""; + previousChar = '\0'; + } + + /** + * Simple helper function to check if a tag is self-closing. + * EFFECTS: Returns whether a String tag is a self-closing tag. + */ private static boolean isSelfClosingTag(String tag) { switch (tag) { case "input": case "param": @@ -281,7 +369,7 @@ public class HtmlParser { <div id="intro"> <img id="face" src="assets/compass.jpg"/> </div> - <!-- <div id="details"> + <div id="details"> <h2>Projects</h2> <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. @@ -291,7 +379,7 @@ public class HtmlParser { dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p> - <h2>Posts</h2> + <!-- <h2>Posts</h2> <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris @@ -299,14 +387,12 @@ public class HtmlParser { Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa - qui officia deserunt mollit anim id est laborum. </p> - </div> --> + qui officia deserunt mollit anim id est laborum. </p> --> + </div> </main> <footer> <span><img src="assets/copyleft.svg" width="12" height="12"/> 2020-2022 j-james </span> </footer> </body> </html> -<!-- - */ |