about summary refs log tree commit diff
diff options
context:
space:
mode:
author j-james 2022-10-17 09:10:50 +0000
committer j-james 2022-10-17 09:10:50 +0000
commit 453372247c8c173c16fa2234b9645bf7a542ed8d (patch)
tree 97b45f4063f706f8bd974958fe317f96b2044c53
parent 2bb1c153b693095b6bbafdfad139791817280af4 (diff)
Some edge cases: check for escaped quotes and ignore multiple whitespace chars
-rw-r--r-- src/main/model/css/CssParser.java 81
-rw-r--r-- src/main/model/html/HtmlParser.java 142
-rw-r--r-- src/test/model/HtmlParserTest.java 25
3 files changed, 155 insertions, 93 deletions
diff --git a/src/main/model/css/CssParser.java b/src/main/model/css/CssParser.java
index 5f78f0a..8d57bdc 100644
--- a/src/main/model/css/CssParser.java
+++ b/src/main/model/css/CssParser.java
@@ -15,35 +15,6 @@ import java.util.*;
* VALUE ::= ??? idk lol
*/
-/*
- * body {
- * background-color: #f0f0f2;
- * margin: 0;
- * padding: 0;
- * font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI",
- * "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
- *
- * }
- * div {
- * width: 600px;
- * margin: 5em auto;
- * padding: 2em;
- * background-color: #fdfdff;
- * border-radius: 0.5em;
- * box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
- * }
- * a:link, a:visited {
- * color: #38488f;
- * text-decoration: none;
- * }
- * @media (max - width : 700px) {
- * div {
- * margin: 0 auto;
- * width: auto;
- * }
- * }
- */
-
/**
* This class assumes that it is getting _valid CSS_: that is, the style between two tags
* of a style block, or the raw content of a .css file.
@@ -75,6 +46,7 @@ public class CssParser {
var currentRule = new ArrayList<Pair<String, String>>();
var currentProperty = "";
var currentValue = "";
+ var previousChar = '\0';
// We safely assume to start by reading a selector.
ParserState state = ParserState.SELECTORS;
@@ -178,8 +150,15 @@ public class CssParser {
case SINGLE_QUOTES:
switch (c) {
case '\'':
- state = ParserState.VALUE;
- currentValue += c;
+ if (previousChar != '\\') {
+ // Unescaped quote: leave the quoted run; the quote character is kept in the value.
+ state = ParserState.VALUE;
+ currentValue += c;
+ previousChar = '\0';
+ } else {
+ // Escaped quote: replace the escaping backslash with the quote itself.
+ // A backslash is a single char, so strip at most one trailing char. The earlier
+ // "length() - 2" also deleted the character before the backslash and threw
+ // StringIndexOutOfBoundsException when fewer than two chars were buffered.
+ // NOTE(review): assumes the backslash was appended to currentValue by the
+ // default branch; the endsWith guard keeps this safe if it was not.
+ if (currentValue.endsWith("\\")) {
+ currentValue = currentValue.substring(0, currentValue.length() - 1);
+ }
+ currentValue += c;
+ previousChar = c;
+ }
break;
default:
currentValue += c;
@@ -189,8 +168,15 @@ public class CssParser {
case DOUBLE_QUOTES:
switch (c) {
case '\"':
- state = ParserState.VALUE;
- currentValue += c;
+ if (previousChar != '\\') {
+ // Unescaped quote: leave the quoted run; the quote character is kept in the value.
+ state = ParserState.VALUE;
+ currentValue += c;
+ previousChar = '\0';
+ } else {
+ // Escaped quote: replace the escaping backslash with the quote itself.
+ // A backslash is a single char, so strip at most one trailing char. The earlier
+ // "length() - 2" also deleted the character before the backslash and threw
+ // StringIndexOutOfBoundsException when fewer than two chars were buffered.
+ // NOTE(review): assumes the backslash was appended to currentValue by the
+ // default branch; the endsWith guard keeps this safe if it was not.
+ if (currentValue.endsWith("\\")) {
+ currentValue = currentValue.substring(0, currentValue.length() - 1);
+ }
+ currentValue += c;
+ previousChar = c;
+ }
break;
default:
currentValue += c;
@@ -245,3 +231,32 @@ public class CssParser {
}
}
}
+
+/*
+ * body {
+ * background-color: #f0f0f2;
+ * margin: 0;
+ * padding: 0;
+ * font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI",
+ * "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+ *
+ * }
+ * div {
+ * width: 600px;
+ * margin: 5em auto;
+ * padding: 2em;
+ * background-color: #fdfdff;
+ * border-radius: 0.5em;
+ * box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
+ * }
+ * a:link, a:visited {
+ * color: #38488f;
+ * text-decoration: none;
+ * }
+ * @media (max - width : 700px) {
+ * div {
+ * margin: 0 auto;
+ * width: auto;
+ * }
+ * }
+ */
diff --git a/src/main/model/html/HtmlParser.java b/src/main/model/html/HtmlParser.java
index 6ad5af4..d6b4ff1 100644
--- a/src/main/model/html/HtmlParser.java
+++ b/src/main/model/html/HtmlParser.java
@@ -6,54 +6,6 @@ import model.util.Node;
import org.javatuples.*;
/*
-<!DOCTYPE html>
-<html>
-<head>
- <title>j-james</title>
- <meta charset="utf-8"/>
- <meta name="viewport" content="width=device-width"/>
- <link rel="icon" type="image/jpg" href="assets/compass.jpg"/>
- <link rel="stylesheet" href="css/normalize.css"/>
- <link rel="stylesheet" href="css/style.css"/>
-</head>
-<body>
- <header>
- <h1>
- <a href="https://j-james.me">j-james</a>
- </h1>
- <nav>
- <a href="https://j-james.me/about">about</a>
- <a href="https://j-james.me/resume">resume</a>
- <a href="https://j-james.me/posts">posts</a>
- <a href="https://j-james.me/writeups">writeups</a>
- </nav>
- </header>
- <main>
- <div id="intro">
- <img id="face" src="assets/compass.jpg"/>
- <div id="profile">
- <p> Hello, I'm JJ, and I go by j-james on the Internet. </p>
- <p> I'm a second-year student at the <a href="https://ubc.ca">University of British Columbia</a>, flag hunter for <a href="https://ubcctf.github.io">Maple Bacon</a>, embedded programmer on <a href="https://ubcbionics.com/">UBC Bionics</a>, and occasional ultimate frisbee and roller/ice hockey player.</p>
- <p> Outside of school, sports, and social life, I enjoy building and contributing to <a href="https://www.gnu.org/philosophy/free-sw">free-and-open-source</a> projects. The majority of my work can either be found on <a href="https://github.com/j-james">GitHub</a> or at <a href="https://sr.ht/~j-james">SourceHut</a>. </p>
- </div>
- </div>
- <!-- <div id="details">
- <h2>Projects</h2>
- <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
- <h2>Posts</h2>
- <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. </p>
- </div> -->
- </main>
- <footer>
- <span><img src="assets/copyleft.svg" width="12" height="12"/> 2020-2022 j-james </span>
- </footer>
-</body>
-</html>
-<!--
-
-*/
-
-/*
* HTML ::= '<!DOCTYPE html>' (NODE)*
* NODE ::= '<'TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* '>' (NODE)* '</' TAG '>'
* | '<'SINGLE_TAG (' ' WORD '=' ('"'TEXT'"' | TEXT))* ('>'|'/>')
@@ -66,7 +18,7 @@ import org.javatuples.*;
public class HtmlParser {
private enum ParserState {
- HTML,
+ HTML, IGNORED,
OPENING_TAG, KEY, VALUE,
SINGLE_QUOTE, DOUBLE_QUOTE,
UNKNOWN_TAG, CLOSING_TAG,
@@ -81,7 +33,7 @@ public class HtmlParser {
var currentKey = "";
var currentValue = "";
var currentText = "";
- var previousChar = '\0';
+ var previousChar = '\0'; // important for quote escapes, and multiple whitespace chars
// We safely? assume to start outside of all nodes.
ParserState state = ParserState.HTML;
@@ -101,10 +53,18 @@ public class HtmlParser {
result.add(new TextNode(currentText));
}
currentText = "";
+ previousChar = '\0';
}
break; // FOOTGUN LANGUAGE DESIGN
+ case ' ': case '\n':
+ if (previousChar != ' ') {
+ currentText += ' ';
+ }
+ previousChar = ' ';
+ break;
default:
currentText += c;
+ previousChar = c;
break;
}
break;
@@ -118,20 +78,31 @@ public class HtmlParser {
currentText += "<>";
System.out.println("Why would you put <> in your HTML??? go away");
break;
- // Currently doesn't handle <!DOCTYPE> different from any other tag
- case '!': default:
+ // For now, we'll straight-up ignore anything matching the <!...> syntax:
+ // i.e. comments, and <!DOCTYPE html>
+ case '!':
+ state = ParserState.IGNORED;
+ break;
+ default:
state = ParserState.OPENING_TAG;
currentTag += c;
break;
}
break; // FOOTGUN LANGUAGE DESIGN STRIKES AGAIN
+ case IGNORED:
+ switch (c) {
+ case '>':
+ state = ParserState.HTML;
+ break;
+ default:
+ break;
+ }
+ break;
case OPENING_TAG:
switch (c) {
case '>':
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
- System.out.println("Adding ElementNode " + currentTag);
- System.out.println("Current size of unfinished: " + unfinished.size());
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
@@ -174,8 +145,6 @@ public class HtmlParser {
case '>':
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
- System.out.println("Adding ElementNode " + currentTag);
- System.out.println("Current size of unfinished: " + unfinished.size());
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
@@ -216,8 +185,6 @@ public class HtmlParser {
}
state = ParserState.HTML;
var node = new ElementNode(currentTag, currentAttributes);
- System.out.println("Adding ElementNode " + currentTag);
- System.out.println("Current size of unfinished: " + unfinished.size());
if (unfinished.size() != 0) {
unfinished.getLast().addChild(node);
unfinished.add(node);
@@ -240,6 +207,7 @@ public class HtmlParser {
state = ParserState.VALUE;
previousChar = '\0';
} else {
+ // Escaped quote: drop only the escaping backslash (one char). "length() - 2" also
+ // removed the character before it and threw on values shorter than two chars.
+ // NOTE(review): assumes the backslash was appended to currentValue earlier; the
+ // endsWith guard keeps this safe if it was not.
+ if (currentValue.endsWith("\\")) {
+ currentValue = currentValue.substring(0, currentValue.length() - 1);
+ }
currentValue += c;
previousChar = c;
}
@@ -257,6 +225,7 @@ public class HtmlParser {
state = ParserState.VALUE;
previousChar = '\0';
} else {
+ // Escaped quote: drop only the escaping backslash (one char). "length() - 2" also
+ // removed the character before it and threw on values shorter than two chars.
+ // NOTE(review): assumes the backslash was appended to currentValue earlier; the
+ // endsWith guard keeps this safe if it was not.
+ if (currentValue.endsWith("\\")) {
+ currentValue = currentValue.substring(0, currentValue.length() - 1);
+ }
currentValue += c;
previousChar = c;
}
@@ -284,3 +253,60 @@ public class HtmlParser {
}
}
}
+
+/*
+<!DOCTYPE html>
+<html>
+<head>
+ <title>j-james</title>
+ <meta charset="utf-8"/>
+ <meta name="viewport" content="width=device-width"/>
+ <link rel="icon" type="image/jpg" href="assets/compass.jpg"/>
+ <link rel="stylesheet" href="css/normalize.css"/>
+ <link rel="stylesheet" href="css/style.css"/>
+</head>
+<body>
+ <header>
+ <h1>
+ <a href="https://j-james.me">j-james</a>
+ </h1>
+ <nav>
+ <a href="https://j-james.me/about">about</a>
+ <a href="https://j-james.me/resume">resume</a>
+ <a href="https://j-james.me/posts">posts</a>
+ <a href="https://j-james.me/writeups">writeups</a>
+ </nav>
+ </header>
+ <main>
+ <div id="intro">
+ <img id="face" src="assets/compass.jpg"/>
+ </div>
+ <!-- <div id="details">
+ <h2>Projects</h2>
+ <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit,
+ sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
+ nisi ut aliquip ex ea commodo consequat.
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
+ dolore eu fugiat nulla pariatur.
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa
+ qui officia deserunt mollit anim id est laborum. </p>
+ <h2>Posts</h2>
+ <p> Lorem ipsum dolor sit amet, consectetur adipiscing elit,
+ sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
+ nisi ut aliquip ex ea commodo consequat.
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum
+ dolore eu fugiat nulla pariatur.
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa
+ qui officia deserunt mollit anim id est laborum. </p>
+ </div> -->
+ </main>
+ <footer>
+ <span><img src="assets/copyleft.svg" width="12" height="12"/> 2020-2022 j-james </span>
+ </footer>
+</body>
+</html>
+<!--
+
+*/
diff --git a/src/test/model/HtmlParserTest.java b/src/test/model/HtmlParserTest.java
index e83c857..4b05cfb 100644
--- a/src/test/model/HtmlParserTest.java
+++ b/src/test/model/HtmlParserTest.java
@@ -1,21 +1,25 @@
package model;
+import model.html.ElementNode;
import model.html.HtmlParser;
+import model.util.Node;
import org.junit.jupiter.api.Test;
-import java.util.Arrays;
+import java.util.*;
import static org.junit.jupiter.api.Assertions.*;
public class HtmlParserTest {
- String idiomaticHtml = "<!DOCTYPE html><html><head></head><body><p>Hello,world!</p></body></html>";
+ String idiomaticHtml = "<!DOCTYPE html><html><head></head><body><p>Hello, world!</p></body></html>";
String brokenHtml = "<html><foo><bar></bar><ba";
String trailingTextHtml = "<html><foo><bar></bar>ba";
@Test
void testIdiomaticHtml() {
+ // Smoke test only: parses idiomaticHtml and prints the result. Nothing is asserted
+ // because the assertEquals at the end of this method is commented out.
+ // NOTE(review): the array below is only referenced by that commented-out assertion
+ // and still holds "Hello,world!", while idiomaticHtml now reads "Hello, world!".
String[] idiomaticHtmlArray = {"<!DOCTYPE html>","<html>","<head>","</head>","<body>","<p>","Hello,world!","</p>","</body>","</html>"};
+ var parsedHtml = HtmlParser.parseHtmlLL(idiomaticHtml);
+ displayHtmlTree(parsedHtml);
+ // NOTE(review): this parses the same input a second time instead of printing parsedHtml.
System.out.println(HtmlParser.parseHtmlLL(idiomaticHtml));
// assertEquals(HtmlParser.parseHtmlLL(idiomaticHtml), Arrays.asList(idiomaticHtmlArray));
}
@@ -23,12 +27,29 @@ public class HtmlParserTest {
@Test
void testBrokenHtml() {
+ // Smoke test only: parses brokenHtml (input that ends inside an unfinished tag) and
+ // prints the result. The array below is only used by the commented-out assertion.
String[] brokenHtmlArray = {"<html>","<foo>","<bar>","</bar>","<ba>"};
+ System.out.println(HtmlParser.parseHtmlLL(brokenHtml));
// assertEquals(HtmlParser.parseHtmlLL(brokenHtml), Arrays.asList(brokenHtmlArray));
}
@Test
void testTrailingTextHtml() {
+ // Smoke test only: parses trailingTextHtml (input that ends with bare text after the
+ // last closing tag) and prints the result. The array below is only used by the
+ // commented-out assertion.
String[] trailingTextHtmlArray = {"<html>","<foo>","<bar>","</bar>","ba"};
+ System.out.println(HtmlParser.parseHtmlLL(trailingTextHtml));
// assertEquals(HtmlParser.parseHtmlLL(trailingTextHtml), Arrays.asList(trailingTextHtmlArray));
}
+
+ /**
+ * Debug helper that prints a list of parsed nodes to standard output, depth-first.
+ * For each ElementNode it prints the tag followed by the data of every direct child
+ * on one line, then recurses into those children. Any other node is printed as
+ * "Text: " followed by its data.
+ *
+ * @param tree the nodes to print
+ */
+ private void displayHtmlTree(ArrayList<Node> tree) {
+ for (Node node : tree) {
+ if (node instanceof ElementNode) {
+ // Header line: "<tag>: " followed by the data of each direct child.
+ System.out.print(((ElementNode) node).getTag() + ": ");
+ for (Node n : ((ElementNode) node).getChildren()) {
+ System.out.print(n.getData() + " ");
+ }
+ System.out.println();
+ // Then print each child's own subtree.
+ displayHtmlTree(((ElementNode) node).getChildren());
+ } else {
+ System.out.println("Text: " + node.getData());
+ }
+ }
+ }
}