From 0caf1994dae8e88f7c219bedd87b65190b88aa89 Mon Sep 17 00:00:00 2001 From: j-james Date: Sun, 16 Oct 2022 23:58:44 -0700 Subject: Implement LL(1) parsers for HTML and CSS --- src/main/model/css/CssParser.java | 247 +++++++++++++++++++++++++++++++ src/main/model/css/CssTree.java | 54 +++++++ src/main/model/html/HtmlParser.java | 288 ++++++++++++++++++++++++++++++++++++ src/main/model/html/HtmlTree.java | 33 +++++ 4 files changed, 622 insertions(+) create mode 100644 src/main/model/css/CssParser.java create mode 100644 src/main/model/css/CssTree.java create mode 100644 src/main/model/html/HtmlParser.java create mode 100644 src/main/model/html/HtmlTree.java (limited to 'src/main') diff --git a/src/main/model/css/CssParser.java b/src/main/model/css/CssParser.java new file mode 100644 index 0000000..5f78f0a --- /dev/null +++ b/src/main/model/css/CssParser.java @@ -0,0 +1,247 @@ +package model.css; + +import org.javatuples.*; + +import java.util.*; + +/* + * RULES ::= (RULE)+ + * RULE ::= SELECTORS '{' (PROPERTY | (PROPERTY ';')*) '}' + * SELECTORS ::= SELECTOR (COMBINATOR SELECTOR)* + * SELECTOR ::= TAG | '#' WORD | '.' WORD + * COMBINATOR ::= '<' | '*' | '~' | ' ' | ... + * PROPERTY ::= ATTRIBUTE ':' VALUE + * ATTRIBUTE ::= 'color' | 'text' | ... + * VALUE ::= ??? idk lol + */ + +/* + * body { + * background-color: #f0f0f2; + * margin: 0; + * padding: 0; + * font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", + * "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; + * + * } + * div { + * width: 600px; + * margin: 5em auto; + * padding: 2em; + * background-color: #fdfdff; + * border-radius: 0.5em; + * box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02); + * } + * a:link, a:visited { + * color: #38488f; + * text-decoration: none; + * } + * @media (max - width : 700px) { + * div { + * margin: 0 auto; + * width: auto; + * } + * } + */ + +/** + * This class assumes that it is getting _valid CSS_: that is, the style between two tags + * of a style block, or the raw content of a .css file. + * Making sure this assumption holds is extremely important for program robustness. + * We do not check for validity, i.e. throw any exceptions - the driving principle of web standards is to "fail softly". + */ +public class CssParser { + + /** + * CSS is nice to parse, and so we have a relatively small number of parser states. + */ + private enum ParserState { + SELECTORS, MEDIA_SELECTORS, + ATTRIBUTE, VALUE, // PROPERTIES::PROPERTY::ATTRIBUTE, PROPRETIES::PROPERTY::VALUE + SINGLE_QUOTES, DOUBLE_QUOTES, // VALUE::SINGLE_QUOTES, VALUE::DOUBLE_QUOTES + } + + /** + * Parses a (valid) CSS file in a left-to-right, leftmost-derivation style. + * It should be fast - I'd say something about time complexity if I knew anything about time complexity. + * No guarantees are made about invalid CSS files. Also, no guarantees are made about valid CSS files, lol. + */ + public static ArrayList>>> parseLL(String input) { + + // parser buffers + // essentially the CssTree type + var result = new ArrayList>>>(); + var currentSelector = ""; + var currentRule = new ArrayList>(); + var currentProperty = ""; + var currentValue = ""; + + // We safely assume to start by reading a selector. + ParserState state = ParserState.SELECTORS; + + for (char c : input.toCharArray()) { + // System.out.print(state); + // System.out.println(" " + c); + switch (state) { + case SELECTORS: + switch (c) { + case '@': + if (currentSelector.equals("")) { + state = ParserState.MEDIA_SELECTORS; + } else { + currentSelector += c; + } + break; + case '{': + state = ParserState.ATTRIBUTE; + break; + case ' ': case '\n': + break; + // todo: do better than blindly create a string; pattern match on css selectors + default: + currentSelector += c; + break; + } + break; + case MEDIA_SELECTORS: + switch (c) { + // todo: don't entirely disregard media queries, also split between @media/@... + case '{': + state = ParserState.SELECTORS; + // discard currentSelector + currentSelector = ""; + break; + default: + currentSelector += c; + break; + } + break; + case ATTRIBUTE: + switch (c) { + case ':': + state = ParserState.VALUE; + break; + case '}': + state = ParserState.SELECTORS; + if (!currentValue.equals("") || !currentProperty.equals("")) { + System.out.println("something's wrong"); + currentProperty = ""; + currentValue = ""; + } + result.add(new Pair<>(currentSelector, currentRule)); + System.out.println(currentRule); + currentSelector = ""; + currentRule = new ArrayList<>(); + break; + case ' ': case '\n': + break; + default: + currentProperty += c; + break; + } + break; + case VALUE: + switch (c) { + case ';': + state = ParserState.ATTRIBUTE; + currentRule.add(new Pair<>(currentProperty, currentValue)); + currentProperty = ""; + currentValue = ""; + break; + case '}': + state = ParserState.SELECTORS; + if (!currentValue.equals("") || !currentProperty.equals("")) { + currentRule.add(new Pair<>(currentProperty, currentValue)); + currentProperty = ""; + currentValue = ""; + } + result.add(new Pair<>(currentSelector, currentRule)); + currentSelector = ""; + currentRule = new ArrayList<>(); + break; + case '\'': + state = ParserState.SINGLE_QUOTES; + currentValue += c; + break; + case '\"': + state = ParserState.DOUBLE_QUOTES; + currentValue += c; + break; + case ' ': case '\n': + break; + default: + currentValue += c; + break; + } + break; + // quotes in css are exclusively? for paths: so we want to include the quotes themselves + case SINGLE_QUOTES: + switch (c) { + case '\'': + state = ParserState.VALUE; + currentValue += c; + break; + default: + currentValue += c; + break; + } + break; + case DOUBLE_QUOTES: + switch (c) { + case '\"': + state = ParserState.VALUE; + currentValue += c; + break; + default: + currentValue += c; + break; + } + break; + } + } + return result; + } + + /** + * Takes an input string with units and returns out the value in pixels. + * This is a fault-tolerant system. + * When given an invalid string (i.e. "12p53x"), it will produce an invalid result instead of throwing. + * However, it should parse every valid string correctly. + */ + private double parseUnits(String input) { + String numbers = ""; + String units = ""; + // imagine making a language without iterable strings, fml + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + if (Character.isDigit(c) || c == '.' || c == '-') { + numbers += c; + } else { + units += c; + } + } + double value; + try { + value = Float.parseFloat(numbers); + } catch (NumberFormatException e) { + System.out.printf("Did not parse a float from %s, proceeding with value 0.0...%n", numbers); + value = 0.0; + } + // god case/break is such a fault-provoking design i hate it + // good thing we avoid breaks entirely here lmao + switch (units) { + // absolute units + case "px": return value; + case "pc": return value * 16; + case "pt": return value * (4.0 / 3.0); + case "cm": return value * 37.8; + case "mm": return value * 378; + case "Q": return value * 1512; + case "in": return value * 96; + // not handled: % em ex ch rem lh rlh vw vh vmin vmax vb vi svw svh lvw lvh dvw dvh + default: + System.out.printf("Unit %s not implemented, defaulting to %s in pixels...%n", units, value); + return value; + } + } +} diff --git a/src/main/model/css/CssTree.java b/src/main/model/css/CssTree.java new file mode 100644 index 0000000..1829327 --- /dev/null +++ b/src/main/model/css/CssTree.java @@ -0,0 +1,54 @@ +package model.css; + +import java.util.ArrayList; + +/** + * This isn't really a tree. Do I even need this class? + */ +public class CssTree { + public class CssProperty { + private String attribute; + private String value; + + public CssProperty(String attribute, String value) { + this.attribute = attribute; + this.value = value; + } + + public String getAttribute() { + return attribute; + } + + public String getValue() { + return value; + } + } + + public class CssRule { + private String selectors; + private ArrayList properties; + + public CssRule(String selectors, ArrayList properties) { + this.selectors = selectors; + this.properties = properties; + } + + public String getSelectors() { + return this.selectors; + } + + public ArrayList getProperties() { + return this.properties; + } + } + + private ArrayList rules; + + public CssTree(ArrayList rules) { + this.rules = rules; + } + + public ArrayList getRules() { + return this.rules; + } +} diff --git a/src/main/model/html/HtmlParser.java b/src/main/model/html/HtmlParser.java new file mode 100644 index 0000000..5109e62 --- /dev/null +++ b/src/main/model/html/HtmlParser.java @@ -0,0 +1,288 @@ +package model.html; + +import java.util.*; + +import model.html.HtmlTree; +import org.javatuples.*; + + +/* + + + + + j-james + + + + + + + +
+

+ j-james +

+ +
+
+
+ +
+

Hello, I'm JJ, and I go by j-james on the Internet.

+

I'm a second-year student at the University of British Columbia, flag hunter for Maple Bacon, embedded programmer on UBC Bionics, and occasional ultimate frisbee and roller/ice hockey player.

+

Outside of school, sports, and social life, I enjoy building and contributing to free-and-open-source projects. The majority of my work can either be found on GitHub or at SourceHut.

+
+
+ +
+
+ 2020-2022 j-james +
+ + +