package model.html; import java.util.*; import model.util.Node; import org.javatuples.*; /* * HTML ::= '' (NODE)* * NODE ::= '<'TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* '>' (NODE)* '' * | '<'SINGLE_TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* ('>'|'/>') * | (TEXT | NODE)* * TEXT ::= UNICODE - {'"'} + {'\"'} * TAG ::= 'body' | 'div' | ... * SINGLE_TAG ::= 'img' | ... * (note that \forall T \in SINGLE_TAG, T \notin TAG) */ public class HtmlParser { private enum ParserState { HTML, IGNORED, OPENING_TAG, KEY, VALUE, SINGLE_QUOTE, DOUBLE_QUOTE, UNKNOWN_TAG, CLOSING_TAG, } public static ArrayList parseHtmlLL(String input) { var result = new ArrayList(); var unfinished = new ArrayDeque(); var currentTag = ""; var currentAttributes = new ArrayList>(); var currentKey = ""; var currentValue = ""; var currentText = ""; var previousChar = '\0'; // important for quote escapes, and multiple whitespace chars // We safely? assume to start outside of all nodes. ParserState state = ParserState.HTML; for (char c : input.toCharArray()) { // System.out.print(state); // System.out.println(" " + c + " " + currentText); switch (state) { case HTML: switch (c) { case '<': state = ParserState.UNKNOWN_TAG; if (!currentText.equals("")) { if (unfinished.size() != 0) { unfinished.getLast().addChild(new TextNode(currentText)); } else { result.add(new TextNode(currentText)); } currentText = ""; previousChar = '\0'; } break; // FOOTGUN LANGUAGE DESIGN case ' ': case '\n': if (previousChar != ' ') { currentText += ' '; } previousChar = ' '; break; default: currentText += c; previousChar = c; break; } break; case UNKNOWN_TAG: switch (c) { case '/': state = ParserState.CLOSING_TAG; break; case '>': // Why would you put <> in your HTML??? go away state = ParserState.HTML; currentText += "<>"; System.out.println("Why would you put <> in your HTML??? go away"); break; // For now, we'll straight-up ignore anything matching the syntax: // i.e. comments, and case '!': state = ParserState.IGNORED; break; default: state = ParserState.OPENING_TAG; currentTag += c; break; } break; // FOOTGUN LANGUAGE DESIGN STRIKES AGAIN case IGNORED: switch (c) { case '>': state = ParserState.HTML; break; default: break; } break; case OPENING_TAG: switch (c) { case '>': state = ParserState.HTML; var node = new ElementNode(currentTag, currentAttributes); if (unfinished.size() != 0) { unfinished.getLast().addChild(node); unfinished.add(node); } else { result.add(node); unfinished.add((ElementNode) result.get(result.size() - 1)); } currentTag = ""; currentAttributes = new ArrayList<>(); break; case ' ': case '\n': state = ParserState.KEY; break; default: currentTag += c; break; } break; case CLOSING_TAG: switch (c) { case '>': state = ParserState.HTML; // IMPORTANT: we don't validate that closing tags correspond to an open tag if (!isSelfClosingTag(currentTag)) { if (unfinished.size() != 0) { unfinished.removeLast(); } } currentTag = ""; break; case ' ': case '\n': break; default: currentTag += c; break; } break; case KEY: switch (c) { case '>': state = ParserState.HTML; var node = new ElementNode(currentTag, currentAttributes); if (unfinished.size() != 0) { unfinished.getLast().addChild(node); unfinished.add(node); } else { result.add(node); unfinished.add((ElementNode) result.get(result.size() - 1)); } currentTag = ""; currentAttributes = new ArrayList<>(); break; case '=': state = ParserState.VALUE; break; case ' ': case '\n': break; default: currentKey += c; break; } break; case VALUE: switch (c) { case '\'': state = ParserState.SINGLE_QUOTE; break; case '\"': state = ParserState.DOUBLE_QUOTE; break; case ' ': case '\n': currentAttributes.add(new Pair<>(currentKey, currentValue)); currentKey = ""; currentValue = ""; case '>': if (!currentKey.equals("") || !currentValue.equals("")) { currentAttributes.add(new Pair<>(currentKey, currentValue)); currentKey = ""; currentValue = ""; } state = ParserState.HTML; var node = new ElementNode(currentTag, currentAttributes); if (unfinished.size() != 0) { unfinished.getLast().addChild(node); unfinished.add(node); } else { result.add(node); unfinished.add((ElementNode) result.get(result.size() - 1)); } currentTag = ""; currentAttributes = new ArrayList<>(); break; default: currentValue += c; break; } break; case SINGLE_QUOTE: switch (c) { case '\'': if (previousChar != '\\') { state = ParserState.VALUE; previousChar = '\0'; } else { currentValue = currentValue.substring(0, currentValue.length() - 2); currentValue += c; previousChar = c; } break; default: currentValue += c; previousChar = c; break; } break; case DOUBLE_QUOTE: switch (c) { case '\"': if (previousChar != '\\') { state = ParserState.VALUE; previousChar = '\0'; } else { currentValue = currentValue.substring(0, currentValue.length() - 2); currentValue += c; previousChar = c; } default: currentValue += c; previousChar = c; break; } break; } } return result; } private static boolean isSelfClosingTag(String tag) { switch (tag) { case "input": case "param": case "br": case "hr": case "wbr": case "img": case "embed": case "area": case "meta": case "base": case "link": case "source": case "track": case "col": return true; default: return false; } } } /* j-james

j-james