package model.html;
import java.util.*;
import model.util.Node;
import org.javatuples.*;
/*
* HTML ::= '<!DOCTYPE html>' (NODE)*
* NODE ::= '<'TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* '>' (NODE)* '</' TAG '>'
* | '<'SINGLE_TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* ('>'|'/>')
* | (TEXT | NODE)*
* TEXT ::= UNICODE - {'"'} + {'\"'}
* TAG ::= 'body' | 'div' | ...
* SINGLE_TAG ::= 'img' | ...
* (note that \forall T \in SINGLE_TAG, T \notin TAG)
*/
public class HtmlParser {
private enum ParserState {
HTML, IGNORED,
OPENING_TAG, KEY, VALUE,
SINGLE_QUOTE, DOUBLE_QUOTE,
UNKNOWN_TAG, CLOSING_TAG,
}
public static ArrayList parseHtmlLL(String input) {
var result = new ArrayList();
var unfinished = new ArrayDeque();
var currentTag = "";
var currentAttributes = new ArrayList>();
var currentKey = "";
var currentValue = "";
var currentText = "";
var previousChar = '\0'; // important for quote escapes, and multiple whitespace chars
// We safely? assume to start outside of all nodes.
ParserState state = ParserState.HTML;
for (char c : input.toCharArray()) {
// System.out.print(state);
// System.out.println(" " + c + " " + currentText);
switch (state) {
case HTML:
switch (c) {
case '<':
state = ParserState.UNKNOWN_TAG;
if (!currentText.equals("")) {
if (unfinished.size() != 0) {
unfinished.getLast().addChild(new TextNode(currentText));
} else {
result.add(new TextNode(currentText));
}
currentText = "";
previousChar = '\0';
}
break; // FOOTGUN LANGUAGE DESIGN
case ' ': case '\n':
if (previousChar != ' ') {
currentText += ' ';
}
previousChar = ' ';
break;
default:
currentText += c;
previousChar = c;
break;
}
break;
case UNKNOWN_TAG:
switch (c) {
case '/':
state = ParserState.CLOSING_TAG;
break;
case '>': // Why would you put <> in your HTML??? go away
state = ParserState.HTML;
currentText += "<>";
System.out.println("Why would you put <> in your HTML??? go away");
break;
// For now, we'll straight-up ignore anything matching the syntax:
// i.e. comments, and
case '!':
state = ParserState.IGNORED;
break;
default:
state = ParserState.OPENING_TAG;
currentTag += c;
break;
}
break; // FOOTGUN LANGUAGE DESIGN STRIKES AGAIN
case IGNORED:
switch (c) {
case '>':
state = ParserState.HTML;
break;
default:
break;
}
break;
case OPENING_TAG:
switch (c) {
case '>':
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
} else {
result.add(node);
unfinished.add((ElementNode) result.get(result.size() - 1));
}
currentTag = "";
currentAttributes = new ArrayList<>();
break;
case ' ': case '\n':
state = ParserState.KEY;
break;
default:
currentTag += c;
break;
}
break;
case CLOSING_TAG:
switch (c) {
case '>':
state = ParserState.HTML;
// IMPORTANT: we don't validate that closing tags correspond to an open tag
if (!isSelfClosingTag(currentTag)) {
if (unfinished.size() != 0) {
unfinished.removeLast();
}
}
currentTag = "";
break;
case ' ': case '\n':
break;
default:
currentTag += c;
break;
}
break;
case KEY:
switch (c) {
case '>':
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
} else {
result.add(node);
unfinished.add((ElementNode) result.get(result.size() - 1));
}
currentTag = "";
currentAttributes = new ArrayList<>();
break;
case '=':
state = ParserState.VALUE;
break;
case ' ': case '\n':
break;
default:
currentKey += c;
break;
}
break;
case VALUE:
switch (c) {
case '\'':
state = ParserState.SINGLE_QUOTE;
break;
case '\"':
state = ParserState.DOUBLE_QUOTE;
break;
case ' ': case '\n':
currentAttributes.add(new Pair<>(currentKey, currentValue));
currentKey = "";
currentValue = "";
case '>':
if (!currentKey.equals("") || !currentValue.equals("")) {
currentAttributes.add(new Pair<>(currentKey, currentValue));
currentKey = "";
currentValue = "";
}
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
} else {
result.add(node);
unfinished.add((ElementNode) result.get(result.size() - 1));
}
currentTag = "";
currentAttributes = new ArrayList<>();
break;
default:
currentValue += c;
break;
}
break;
case SINGLE_QUOTE:
switch (c) {
case '\'':
if (previousChar != '\\') {
state = ParserState.VALUE;
previousChar = '\0';
} else {
currentValue = currentValue.substring(0, currentValue.length() - 2);
currentValue += c;
previousChar = c;
}
break;
default:
currentValue += c;
previousChar = c;
break;
}
break;
case DOUBLE_QUOTE:
switch (c) {
case '\"':
if (previousChar != '\\') {
state = ParserState.VALUE;
previousChar = '\0';
} else {
currentValue = currentValue.substring(0, currentValue.length() - 2);
currentValue += c;
previousChar = c;
}
default:
currentValue += c;
previousChar = c;
break;
}
break;
}
}
return result;
}
private static boolean isSelfClosingTag(String tag) {
switch (tag) {
case "input": case "param":
case "br": case "hr": case "wbr":
case "img": case "embed": case "area":
case "meta": case "base": case "link":
case "source": case "track": case "col":
return true;
default:
return false;
}
}
}
/*
j-james