aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorj-james2022-10-17 06:58:44 +0000
committerj-james2022-10-17 07:00:53 +0000
commit0caf1994dae8e88f7c219bedd87b65190b88aa89 (patch)
tree6b9302c64eb74194e8ed1f267ec711c038b514cd /src
parent3e9bb5fae16c35938bc1f7f7669c12cc355c9331 (diff)
Implement LL(1) parsers for HTML and CSS
Diffstat (limited to 'src')
-rw-r--r--src/main/model/css/CssParser.java247
-rw-r--r--src/main/model/css/CssTree.java54
-rw-r--r--src/main/model/html/HtmlParser.java288
-rw-r--r--src/main/model/html/HtmlTree.java33
-rw-r--r--src/test/model/CssParserTest.java15
-rw-r--r--src/test/model/HtmlParserTest.java34
6 files changed, 671 insertions, 0 deletions
diff --git a/src/main/model/css/CssParser.java b/src/main/model/css/CssParser.java
new file mode 100644
index 0000000..5f78f0a
--- /dev/null
+++ b/src/main/model/css/CssParser.java
@@ -0,0 +1,247 @@
+package model.css;
+
+import org.javatuples.*;
+
+import java.util.*;
+
+/*
+ * RULES ::= (RULE)+
+ * RULE ::= SELECTORS '{' (PROPERTY | (PROPERTY ';')*) '}'
+ * SELECTORS ::= SELECTOR (COMBINATOR SELECTOR)*
+ * SELECTOR ::= TAG | '#' WORD | '.' WORD
+ * COMBINATOR ::= '<' | '*' | '~' | ' ' | ...
+ * PROPERTY ::= ATTRIBUTE ':' VALUE
+ * ATTRIBUTE ::= 'color' | 'text' | ...
+ * VALUE ::= ??? idk lol
+ */
+
+/*
+ * body {
+ * background-color: #f0f0f2;
+ * margin: 0;
+ * padding: 0;
+ * font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI",
+ * "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+ *
+ * }
+ * div {
+ * width: 600px;
+ * margin: 5em auto;
+ * padding: 2em;
+ * background-color: #fdfdff;
+ * border-radius: 0.5em;
+ * box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
+ * }
+ * a:link, a:visited {
+ * color: #38488f;
+ * text-decoration: none;
+ * }
+ * @media (max - width : 700px) {
+ * div {
+ * margin: 0 auto;
+ * width: auto;
+ * }
+ * }
+ */
+
+/**
+ * This class assumes that it is getting _valid CSS_: that is, the style between two tags
+ * of a style block, or the raw content of a .css file.
+ * Making sure this assumption holds is extremely important for program robustness.
+ * We do not check for validity, i.e. throw any exceptions - the driving principle of web standards is to "fail softly".
+ */
+public class CssParser {
+
+ /**
+ * CSS is nice to parse, and so we have a relatively small number of parser states.
+ */
+ private enum ParserState {
+ SELECTORS, MEDIA_SELECTORS,
+ ATTRIBUTE, VALUE, // PROPERTIES::PROPERTY::ATTRIBUTE, PROPRETIES::PROPERTY::VALUE
+ SINGLE_QUOTES, DOUBLE_QUOTES, // VALUE::SINGLE_QUOTES, VALUE::DOUBLE_QUOTES
+ }
+
+ /**
+ * Parses a (valid) CSS file in a left-to-right, leftmost-derivation style.
+ * It should be fast - I'd say something about time complexity if I knew anything about time complexity.
+ * No guarantees are made about invalid CSS files. Also, no guarantees are made about valid CSS files, lol.
+ */
+ public static ArrayList<Pair<String, ArrayList<Pair<String, String>>>> parseLL(String input) {
+
+ // parser buffers
+ // essentially the CssTree type
+ var result = new ArrayList<Pair<String, ArrayList<Pair<String, String>>>>();
+ var currentSelector = "";
+ var currentRule = new ArrayList<Pair<String, String>>();
+ var currentProperty = "";
+ var currentValue = "";
+
+ // We safely assume to start by reading a selector.
+ ParserState state = ParserState.SELECTORS;
+
+ for (char c : input.toCharArray()) {
+ // System.out.print(state);
+ // System.out.println(" " + c);
+ switch (state) {
+ case SELECTORS:
+ switch (c) {
+ case '@':
+ if (currentSelector.equals("")) {
+ state = ParserState.MEDIA_SELECTORS;
+ } else {
+ currentSelector += c;
+ }
+ break;
+ case '{':
+ state = ParserState.ATTRIBUTE;
+ break;
+ case ' ': case '\n':
+ break;
+ // todo: do better than blindly create a string; pattern match on css selectors
+ default:
+ currentSelector += c;
+ break;
+ }
+ break;
+ case MEDIA_SELECTORS:
+ switch (c) {
+ // todo: don't entirely disregard media queries, also split between @media/@...
+ case '{':
+ state = ParserState.SELECTORS;
+ // discard currentSelector
+ currentSelector = "";
+ break;
+ default:
+ currentSelector += c;
+ break;
+ }
+ break;
+ case ATTRIBUTE:
+ switch (c) {
+ case ':':
+ state = ParserState.VALUE;
+ break;
+ case '}':
+ state = ParserState.SELECTORS;
+ if (!currentValue.equals("") || !currentProperty.equals("")) {
+ System.out.println("something's wrong");
+ currentProperty = "";
+ currentValue = "";
+ }
+ result.add(new Pair<>(currentSelector, currentRule));
+ System.out.println(currentRule);
+ currentSelector = "";
+ currentRule = new ArrayList<>();
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentProperty += c;
+ break;
+ }
+ break;
+ case VALUE:
+ switch (c) {
+ case ';':
+ state = ParserState.ATTRIBUTE;
+ currentRule.add(new Pair<>(currentProperty, currentValue));
+ currentProperty = "";
+ currentValue = "";
+ break;
+ case '}':
+ state = ParserState.SELECTORS;
+ if (!currentValue.equals("") || !currentProperty.equals("")) {
+ currentRule.add(new Pair<>(currentProperty, currentValue));
+ currentProperty = "";
+ currentValue = "";
+ }
+ result.add(new Pair<>(currentSelector, currentRule));
+ currentSelector = "";
+ currentRule = new ArrayList<>();
+ break;
+ case '\'':
+ state = ParserState.SINGLE_QUOTES;
+ currentValue += c;
+ break;
+ case '\"':
+ state = ParserState.DOUBLE_QUOTES;
+ currentValue += c;
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentValue += c;
+ break;
+ }
+ break;
+ // quotes in css are exclusively? for paths: so we want to include the quotes themselves
+ case SINGLE_QUOTES:
+ switch (c) {
+ case '\'':
+ state = ParserState.VALUE;
+ currentValue += c;
+ break;
+ default:
+ currentValue += c;
+ break;
+ }
+ break;
+ case DOUBLE_QUOTES:
+ switch (c) {
+ case '\"':
+ state = ParserState.VALUE;
+ currentValue += c;
+ break;
+ default:
+ currentValue += c;
+ break;
+ }
+ break;
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Takes an input string with units and returns out the value in pixels.
+ * This is a fault-tolerant system.
+ * When given an invalid string (i.e. "12p53x"), it will produce an invalid result instead of throwing.
+ * However, it should parse every valid string correctly.
+ */
+ private double parseUnits(String input) {
+ String numbers = "";
+ String units = "";
+ // imagine making a language without iterable strings, fml
+ for (int i = 0; i < input.length(); i++) {
+ char c = input.charAt(i);
+ if (Character.isDigit(c) || c == '.' || c == '-') {
+ numbers += c;
+ } else {
+ units += c;
+ }
+ }
+ double value;
+ try {
+ value = Float.parseFloat(numbers);
+ } catch (NumberFormatException e) {
+ System.out.printf("Did not parse a float from %s, proceeding with value 0.0...%n", numbers);
+ value = 0.0;
+ }
+ // god case/break is such a fault-provoking design i hate it
+ // good thing we avoid breaks entirely here lmao
+ switch (units) {
+ // absolute units
+ case "px": return value;
+ case "pc": return value * 16;
+ case "pt": return value * (4.0 / 3.0);
+ case "cm": return value * 37.8;
+ case "mm": return value * 378;
+ case "Q": return value * 1512;
+ case "in": return value * 96;
+ // not handled: % em ex ch rem lh rlh vw vh vmin vmax vb vi svw svh lvw lvh dvw dvh
+ default:
+ System.out.printf("Unit %s not implemented, defaulting to %s in pixels...%n", units, value);
+ return value;
+ }
+ }
+}
diff --git a/src/main/model/css/CssTree.java b/src/main/model/css/CssTree.java
new file mode 100644
index 0000000..1829327
--- /dev/null
+++ b/src/main/model/css/CssTree.java
@@ -0,0 +1,54 @@
+package model.css;
+
+import java.util.ArrayList;
+
+/**
+ * This isn't really a tree. Do I even need this class?
+ */
+public class CssTree {
+ public class CssProperty {
+ private String attribute;
+ private String value;
+
+ public CssProperty(String attribute, String value) {
+ this.attribute = attribute;
+ this.value = value;
+ }
+
+ public String getAttribute() {
+ return attribute;
+ }
+
+ public String getValue() {
+ return value;
+ }
+ }
+
+ public class CssRule {
+ private String selectors;
+ private ArrayList<CssProperty> properties;
+
+ public CssRule(String selectors, ArrayList<CssProperty> properties) {
+ this.selectors = selectors;
+ this.properties = properties;
+ }
+
+ public String getSelectors() {
+ return this.selectors;
+ }
+
+ public ArrayList<CssProperty> getProperties() {
+ return this.properties;
+ }
+ }
+
+ private ArrayList<CssRule> rules;
+
+ public CssTree(ArrayList<CssRule> rules) {
+ this.rules = rules;
+ }
+
+ public ArrayList<CssRule> getRules() {
+ return this.rules;
+ }
+}
diff --git a/src/main/model/html/HtmlParser.java b/src/main/model/html/HtmlParser.java
new file mode 100644
index 0000000..5109e62
--- /dev/null
+++ b/src/main/model/html/HtmlParser.java
@@ -0,0 +1,288 @@
+package model.html;
+
+import java.util.*;
+
+import model.html.HtmlTree;
+import org.javatuples.*;
+
+
+/*
+
+<!DOCTYPE html>
+<html>
+<head>
+ <title>j-james</title>
+ <meta charset="utf-8"/>
+ <meta name="viewport" content="width=device-width"/>
+ <link rel="icon" type="image/jpg" href="assets/compass.jpg"/>
+ <link rel="stylesheet" href="css/normalize.css"/>
+ <link rel="stylesheet" href="css/style.css"/>
+</head>
+<body>
+ <header>
+ <h1>
+ <a href="https://j-james.me">j-james</a>
+ </h1>
+ <nav>
+ <a href="https://j-james.me/about">about</a>
+ <a href="https://j-james.me/resume">resume</a>
+ <a href="https://j-james.me/posts">posts</a>
+ <a href="https://j-james.me/writeups">writeups</a>
+ </nav>
+ </header>
+ <main>
+ <div id="intro">
+ <img id="face" src="assets/compass.jpg"/>
+ <div id="profile">
+ <p> Hello, I'm JJ, and I go by j-james on the Internet. </p>
+ <p> I'm a second-year student at the <a href="https://ubc.ca">University of British Columbia</a>, flag hunter for <a href="https://ubcctf.github.io">Maple Bacon</a>, embedded programmer on <a href="https://ubcbionics.com/">UBC Bionics</a>, and occasional ultimate frisbee and roller/ice hockey player.</p>
+ <p> Outside of school, sports, and social life, I enjoy building and contributing to <a href="https://www.gnu.org/philosophy/free-sw">free-and-open-source</a> projects. The majority of my work can either be found on <a href="https://github.com/j-james">GitHub</a> or at <a href="https://sr.ht/~j-james">SourceHut</a>. </p>
+ </div>
+ </div>
+ <!-- <div id="details">
+ <h2>Projects</h2>
+ <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
+ <h2>Posts</h2>
+ <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
+ </div> -->
+ </main>
+ <footer>
+ <span><img src="assets/copyleft.svg" width="12" height="12"/> 2020-2022 j-james </span>
+ </footer>
+</body>
+</html>
+<!--
+
+*/
+
+/*
+ * HTML ::= '<!DOCTYPE html>' (NODE)*
+ * NODE ::= '<'TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* '>' (NODE)* '</' TAG '>'
+ * | '<'SINGLE_TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* ('>'|'/>')
+ * | (TEXT | NODE)*
+ * TEXT ::= UNICODE - {'"'} + {'\"'}
+ * TAG ::= 'body' | 'div' | ...
+ * SINGLE_TAG ::= 'img' | ...
+ * (note that \forall T \in SINGLE_TAG, T \notin TAG)
+ */
+public class HtmlParser {
+
+ private enum ParserState {
+ HTML,
+ OPENING_TAG, KEY, VALUE,
+ SINGLE_QUOTE, DOUBLE_QUOTE,
+ UNKNOWN_TAG, CLOSING_TAG,
+ }
+
+ public static ArrayList<HtmlTree> parseHtmlLL(String input) {
+
+ var result = new ArrayList<HtmlTree>();
+ var unfinished = new ArrayDeque<HtmlTree>();
+ var currentTag = "";
+ var currentAttributes = new ArrayList<Pair<String, String>>();
+ var currentKey = "";
+ var currentValue = "";
+ var currentText = "";
+
+ // We safely? assume to start outside of all nodes.
+ ParserState state = ParserState.HTML;
+
+ for (char c : input.toCharArray()) {
+ switch (state) {
+ case HTML:
+ switch (c) {
+ case '<':
+ if (!currentText.equals("")) {
+ // unfinished.add(text) idk
+ }
+
+ default:
+ currentText += c;
+ break;
+ }
+ break;
+ case UNKNOWN_TAG:
+ switch (c) {
+ case '/':
+ state = ParserState.CLOSING_TAG;
+ break;
+ case '>':
+ state = ParserState.HTML;
+ System.out.println("Why would you put <> in your HTML??? go away");
+ break;
+ default:
+ state = ParserState.OPENING_TAG;
+ currentTag += c;
+ break;
+ }
+ case OPENING_TAG:
+ switch (c) {
+ case '>':
+ state = ParserState.HTML;
+ // unfinished.add(new HtmlTree(tag)
+ currentTag = "";
+ break;
+ case ' ': case '\n':
+ state = ParserState.KEY;
+ break;
+ default:
+ currentTag += c;
+ break;
+ }
+ break;
+ case CLOSING_TAG:
+ switch (c) {
+ case '>':
+ // IMPORTANT: we don't validate that closing tags correspond to an open tag
+ if (!isSelfClosingTag(currentTag)) {
+ //unknown.pop
+ }
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentTag += c;
+ break;
+ }
+ break;
+ case KEY:
+ switch (c) {
+ case '>':
+ state = ParserState.HTML;
+ if (currentAttributes.size() != 0) {
+ // unfinished.something idk new HtmlTree(tag=currentTag, attributes=currentAttributes)
+ currentAttributes.clear();
+ } else {
+ // unfinished.add(new HtmlTree(tag)
+ }
+ currentTag = "";
+ break;
+ case '=':
+ state = ParserState.VALUE;
+ break;
+ case ' ': case '\n':
+ break;
+ default:
+ currentKey += c;
+ break;
+ }
+ break;
+ case VALUE:
+ switch (c) {
+ case '\'':
+ state = ParserState.SINGLE_QUOTE;
+ break;
+ case '\"':
+ state = ParserState.DOUBLE_QUOTE;
+ break;
+ case ' ': case '\n':
+ currentAttributes.add(new Pair<>(currentKey, currentValue));
+ currentKey = "";
+ currentValue = "";
+ case '>':
+ if (!currentKey.equals("") || !currentValue.equals("")) {
+ currentAttributes.add(new Pair<>(currentKey, currentValue));
+ currentKey = "";
+ currentValue = "";
+ }
+ // unfinished.something idk new HtmlTree(tag=currentTag, attributes=currentAttributes)
+ currentAttributes.clear();
+ default:
+ currentValue += c;
+ break;
+ }
+ break;
+ case SINGLE_QUOTE:
+ switch (c) {
+ case '\'':
+ state = ParserState.VALUE;
+ default:
+ currentValue += c;
+ break;
+ }
+ break;
+ case DOUBLE_QUOTE:
+ switch (c) {
+ default:
+ break;
+ }
+ break;
+ }
+ }
+ return result;
+ }
+
+
+ private static boolean isSelfClosingTag(String tag) {
+ switch (tag) {
+ case "input": case "param":
+ case "br": case "hr": case "wbr":
+ case "img": case "embed": case "area":
+ case "meta": case "base": case "link":
+ case "source": case "track": case "col":
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /*
+ public static void parseHtmlLL(String input) {
+ String tag = "";
+ ArrayList<Pair<String, String>> attributes = new ArrayList<>();
+ boolean inTag = false;
+ boolean inAttribute = false; // for checking if we're in quotes
+
+ for (int i = 0; i < input.length(); i++) {
+ if (inTag) {
+ if (inAttribute) {
+ switch (input.charAt(i)) {
+ case '\"'
+ }
+ } else {
+ switch (input.charAt(i)) {
+
+ }
+ }
+
+
+ } else {
+ switch (input.charAt(i)) {
+ case '<':
+ }
+ }
+ }
+ }
+
+ private static void parseAttribute(String input) {
+
+ }
+*/
+
+/*
+ public static void parseHTML(ArrayList<String> input) {
+ String data = "";
+ ArrayList<ParseTree> children = new ArrayList<ParseTree>();
+
+ boolean inTag = false;
+ boolean tagComplete = false;
+
+ for (String i : input) {
+ if (inTag) {
+ if (i.equals(">")) {
+ inTag = false;
+ tagComplete = true;
+ // remove ending tags and recursively parse out children
+ } else {
+ data += i;
+ }
+ } else {
+ if (i.equals("<")) {
+ inTag = true;
+ }
+ }
+
+ }
+
+ }*/
+}
diff --git a/src/main/model/html/HtmlTree.java b/src/main/model/html/HtmlTree.java
new file mode 100644
index 0000000..1aae0a8
--- /dev/null
+++ b/src/main/model/html/HtmlTree.java
@@ -0,0 +1,33 @@
+package model.html;
+
+import model.util.AbstractTree;
+import org.javatuples.Pair;
+
+import java.util.ArrayList;
+import java.util.Optional;
+
+/**
+ * Representation of HTML as a tree of nodes. Sorry about the generics.
+ */
+public class HtmlTree extends AbstractTree<Pair<String, ArrayList<Pair<String, String>>>> {
+ private String tag;
+ private ArrayList<Pair<String, String>> attributes;
+ private Optional<HtmlTree> parent = Optional.empty();
+ private Optional<HtmlTree> sibling = Optional.empty();
+
+ // I don't quite know why I can't say ArrayList<HtmlTree> children.
+ public HtmlTree(String tag, ArrayList<Pair<String, String>> attributes,
+ ArrayList<AbstractTree<Pair<String, ArrayList<Pair<String, String>>>>> children,
+ Optional<HtmlTree> parent, Optional<HtmlTree> sibling) {
+ super(new Pair<>(tag, attributes), children);
+ this.tag = tag;
+ this.attributes = attributes;
+ this.parent = parent;
+ this.sibling = sibling;
+ }
+
+ public HtmlTree(String tag, ArrayList<Pair<String, String>> attributes) {
+ this(tag, attributes, new ArrayList<AbstractTree<Pair<String, ArrayList<Pair<String, String>>>>>(),
+ Optional.empty(), Optional.empty());
+ }
+}
diff --git a/src/test/model/CssParserTest.java b/src/test/model/CssParserTest.java
new file mode 100644
index 0000000..2852da5
--- /dev/null
+++ b/src/test/model/CssParserTest.java
@@ -0,0 +1,15 @@
+package model;
+
+import model.css.CssParser;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class CssParserTest {
+
+ @Test
+ void testIdiomaticCss() {
+ var idiomaticCss = "body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: -apple-system, system-ui, BlinkMacSystemFont, \"Segoe UI\", \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;}div { width: 600px; margin: 5em auto; padding: 2em; background-color: #fdfdff; border-radius: 0.5em; box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);}a:link, a:visited { color: #38488f; text-decoration: none;}@media (max - width : 700px) { div { margin: 0 auto; width: auto; }}";
+ System.out.println(CssParser.parseLL(idiomaticCss));
+ }
+}
diff --git a/src/test/model/HtmlParserTest.java b/src/test/model/HtmlParserTest.java
new file mode 100644
index 0000000..e83c857
--- /dev/null
+++ b/src/test/model/HtmlParserTest.java
@@ -0,0 +1,34 @@
+package model;
+
+import model.html.HtmlParser;
+import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class HtmlParserTest {
+
+ String idiomaticHtml = "<!DOCTYPE html><html><head></head><body><p>Hello,world!</p></body></html>";
+ String brokenHtml = "<html><foo><bar></bar><ba";
+ String trailingTextHtml = "<html><foo><bar></bar>ba";
+
+ @Test
+ void testIdiomaticHtml() {
+ String[] idiomaticHtmlArray = {"<!DOCTYPE html>","<html>","<head>","</head>","<body>","<p>","Hello,world!","</p>","</body>","</html>"};
+ System.out.println(HtmlParser.parseHtmlLL(idiomaticHtml));
+// assertEquals(HtmlParser.parseHtmlLL(idiomaticHtml), Arrays.asList(idiomaticHtmlArray));
+ }
+
+ @Test
+ void testBrokenHtml() {
+ String[] brokenHtmlArray = {"<html>","<foo>","<bar>","</bar>","<ba>"};
+// assertEquals(HtmlParser.parseHtmlLL(brokenHtml), Arrays.asList(brokenHtmlArray));
+ }
+
+ @Test
+ void testTrailingTextHtml() {
+ String[] trailingTextHtmlArray = {"<html>","<foo>","<bar>","</bar>","ba"};
+// assertEquals(HtmlParser.parseHtmlLL(trailingTextHtml), Arrays.asList(trailingTextHtmlArray));
+ }
+}