package model.html; import java.util.*; import org.javatuples.*; /** * This class represents the state of and implements an LL(1) HTML parser. * For convenience, the following (defo wrong) context-free grammar for HTML is below. *
* HTML ::= '' (NODE)* * NODE ::= '<'TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* '>' (NODE)* '<\/' TAG '>' * | '<'SELF_CLOSING_TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* ('>'|'/>') * | (TEXT | NODE)* * TEXT ::= UNICODE - {'"'} + {'\"'} * TAG ::= 'body' | 'div' | ... * SELF_CLOSING_TAG ::= 'img' | ... * (note that \forall T \in SELF_CLOSING_TAG, T \notin TAG) */ public class HtmlParser { /** * HTML is not nice to parse. We manage to get away with a relatively small number of parser states regardless. */ private enum ParserState { HTML, IGNORED, OPENING_TAG, KEY, VALUE, // TAG::OPENING_TAG, TAG::KEY, TAG::VALUE SINGLE_QUOTES, DOUBLE_QUOTES, // VALUE::SINGLE_QUOTES, VALUE::DOUBLE_QUOTES UNKNOWN_TAG, CLOSING_TAG, } // HTML documents are uniquely a list of Nodes rather than a Node themselves private ArrayList result; // a bunch of useful buffers. see CssParser for commentary. private ArrayDeque unfinished; private String currentTag; private ArrayList> currentAttributes; private String currentKey; private String currentValue; private String currentText; // important for quote escapes, and multiple whitespace chars private char previousChar; private ParserState state; public HtmlParser() { result = new ArrayList<>(); unfinished = new ArrayDeque<>(); currentTag = ""; currentAttributes = new ArrayList<>(); currentKey = ""; currentValue = ""; currentText = ""; previousChar = '\0'; // We safely? assume to start outside of all nodes. state = ParserState.HTML; } public ArrayList parseHtml(String input) { for (char c : input.toCharArray()) { switch (state) { case HTML -> caseHtml(c); case UNKNOWN_TAG -> caseUnknownTag(c); case IGNORED -> caseIgnored(c); case OPENING_TAG -> caseOpeningTag(c); case CLOSING_TAG -> caseClosingTag(c); case KEY -> caseKey(c); case VALUE -> caseValue(c); case SINGLE_QUOTES -> caseSingleQuotes(c); case DOUBLE_QUOTES -> caseDoubleQuotes(c); } } return result; } private void caseHtml(char c) { switch (c) { case '<' -> { state = ParserState.UNKNOWN_TAG; if (!currentText.equals("")) { addNewTextNode(); } } case ' ', '\t', '\n' -> { if (previousChar != ' ') { currentText += ' '; } previousChar = ' '; } default -> { currentText += c; previousChar = c; } } } private void caseUnknownTag(char c) { switch (c) { case '/' -> state = ParserState.CLOSING_TAG; case '>' -> { // Why would you put <> in your HTML??? go away state = ParserState.HTML; currentText += "<>"; } // For now, we'll straight up ignore anything matching the syntax: // i.e. comments, and case '!' -> state = ParserState.IGNORED; default -> { state = ParserState.OPENING_TAG; currentTag += c; } } } private void caseIgnored(char c) { switch (c) { case '>' -> state = ParserState.HTML; default -> {} } } private void caseOpeningTag(char c) { switch (c) { case '>' -> addNewElementNode(); case ' ', '\t', '\n' -> state = ParserState.KEY; default -> currentTag += c; } } private void caseClosingTag(char c) { switch (c) { case '>' -> { state = ParserState.HTML; // IMPORTANT: we don't validate that closing tags correspond to an open tag if (!isSelfClosingTag(currentTag)) { if (unfinished.size() != 0) { unfinished.removeLast(); } } currentTag = ""; } case ' ', '\t', '\n' -> {} default -> currentTag += c; } } private void caseKey(char c) { switch (c) { case '>' -> addNewElementNode(); case '=' -> state = ParserState.VALUE; case ' ', '\t', '\n' -> {} default -> currentKey += c; } } private void caseValue(char c) { switch (c) { case '\'' -> state = ParserState.SINGLE_QUOTES; case '\"' -> state = ParserState.DOUBLE_QUOTES; case ' ', '\t', '\n' -> { state = ParserState.KEY; currentAttributes.add(new Pair<>(currentKey, currentValue)); currentKey = ""; currentValue = ""; } case '>' -> { if (!currentKey.equals("") || !currentValue.equals("")) { currentAttributes.add(new Pair<>(currentKey, currentValue)); currentKey = ""; currentValue = ""; } addNewElementNode(); } default -> currentValue += c; } } private void caseSingleQuotes(char c) { switch (c) { case '\'' -> { if (previousChar != '\\') { state = ParserState.VALUE; previousChar = '\0'; } else { currentValue = currentValue.substring(0, currentValue.length() - 1); currentValue += c; previousChar = c; } } default -> { currentValue += c; previousChar = c; } } } private void caseDoubleQuotes(char c) { switch (c) { case '\"' -> { if (previousChar != '\\') { state = ParserState.VALUE; previousChar = '\0'; } else { currentValue = currentValue.substring(0, currentValue.length() - 1); currentValue += c; previousChar = c; } } default -> { currentValue += c; previousChar = c; } } } // Helper function to remove code duplication. private void addNewElementNode() { state = ParserState.HTML; var node = new ElementNode(currentTag, currentAttributes); if (unfinished.size() != 0) { unfinished.getLast().children.add(node); if (!isSelfClosingTag(currentTag)) { unfinished.add(node); } } else { result.add(node); if (!isSelfClosingTag(currentTag)) { unfinished.add((ElementNode) result.get(result.size() - 1)); } } currentTag = ""; currentAttributes = new ArrayList<>(); } // Helper function to check method length boxes. private void addNewTextNode() { if (!currentText.equals(" ")) { // fixme if (unfinished.size() != 0) { unfinished.getLast().children.add(new TextNode(currentText)); } else { result.add(new TextNode(currentText)); } } currentText = ""; previousChar = '\0'; } // Simple helper function to check if a tag is self-closing. private static boolean isSelfClosingTag(String tag) { return switch (tag) { case "input", "param", "br", "hr", "wbr", "img", "embed", "area", "meta", "base", "link", "source", "track", "col" -> true; default -> false; }; } } /* j-james

j-james

Projects

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.