Basic prototypes of HTML/CSS lexers

author: j-james 2022-10-17 06:25:45 +0000
committer: j-james 2022-10-17 06:27:55 +0000
commit: 3e9bb5fae16c35938bc1f7f7669c12cc355c9331 (patch)
tree: 82e1ab837579e7762071ea97c064c0750a38c106 /src
parent: 0845be5ec0215fb43f9dbdef00b22a733d4080b3 (diff)
8 files changed, 360 insertions, 12 deletions
diff --git a/src/main/model/MyModel.java b/src/main/model/MyModel.java
deleted file mode 100644
index f9a3dd7..0000000
--- a/src/main/model/MyModel.java
+++ /dev/null
@@ -1,5 +0,0 @@
-package model;
-
-public class MyModel {
-    // delete or rename this class!
-}
diff --git a/src/main/model/css/CssLexer.java b/src/main/model/css/CssLexer.java
new file mode 100644
index 0000000..657d3e1
--- /dev/null
+++ b/src/main/model/css/CssLexer.java
@@ -0,0 +1,63 @@
+package model.css;
+
+import java.util.ArrayList;
+
+/**
+ * This lexer splits an input by whitespace, braces, semicolons, and colons.
+ * Braces, semicolons, and colons are included in the lexed output, whitespace is not.
+ * <br>
+ * CSS, thankfully, is far more rigid and less forgiving of errors than HTML.
+ * It also has multiple layers of fallback for errors, ranging from "ignore this
+ * property", to "ignore this rule", to "this isn't CSS at all" and ignore it all.
+ * <br>
+ * Still, even though we don't have to deal with garbage like escaped quotes (future edit: whoops, yes we do) and
+ * whatnot, we'll still implement our lexer with a for loop instead of split() for future optimizations.
+ */
+public class CssLexer {
+    /**
+     * Splits raw CSS into tokens. Each of the four characters { } ; : becomes a token of its own; spaces,
+     * newlines and tabs separate tokens and are dropped. Inside a quoted string all of those are kept as text.
+     * A quote only opens or closes a string when it is not preceded by a backslash, and a quote of the other
+     * kind inside a string is plain text. Text still pending when the input ends is emitted as a final token.
+     *
+     * @param input the CSS source text
+     * @return the tokens in input order
+     */
+    public static ArrayList<String> lex(String input) {
+        StringBuilder token = new StringBuilder();
+        ArrayList<String> tokens = new ArrayList<>();
+        boolean inSingleQuotes = false;
+        boolean inDoubleQuotes = false;
+        char previous = '\0';
+
+        for (char i : input.toCharArray()) {
+            boolean quoted = inSingleQuotes || inDoubleQuotes;
+            boolean whitespace = i == ' ' || i == '\n' || i == '\t';
+            boolean punctuation = i == '{' || i == '}' || i == ';' || i == ':';
+            if (!quoted && (whitespace || punctuation)) {
+                // A separator outside of quotes ends the pending token, if there is one.
+                if (token.length() > 0) {
+                    tokens.add(token.toString());
+                    token.setLength(0);
+                }
+                if (punctuation) {
+                    tokens.add(Character.toString(i));
+                }
+            } else {
+                // Each quote kind only toggles its own state, and never while inside the other kind.
+                if (i == '"' && previous != '\\' && !inSingleQuotes) {
+                    inDoubleQuotes = !inDoubleQuotes;
+                } else if (i == '\'' && previous != '\\' && !inDoubleQuotes) {
+                    inSingleQuotes = !inSingleQuotes;
+                }
+                token.append(i);
+            }
+            previous = i;
+        }
+        // Flush whatever is left so input that does not end in a separator keeps its last token.
+        if (token.length() > 0) {
+            tokens.add(token.toString());
+        }
+        return tokens;
+    }
+}
diff --git a/src/main/model/html/HtmlLexer.java b/src/main/model/html/HtmlLexer.java
new file mode 100644
index 0000000..8cad425
--- /dev/null
+++ b/src/main/model/html/HtmlLexer.java
@@ -0,0 +1,68 @@
+package model.html;
+
+import java.util.ArrayList;
+
+/**
+ * We'll tokenize HTML by tags: disregarding the contents of the tag and attributes within the tag.
+ * The file is also considered to be free-form here: whitespace duplicates are disregarded.
+ */
+public class HtmlLexer {
+
+    /**
+     * Takes a String of raw HTML, and tokenizes it for our parser. Every tag, from its opening angle bracket
+     * through its closing one, is one token, and each run of text between two tags is one token. Quotes are
+     * only tracked inside a tag, where they keep an angle bracket within an attribute value from ending the
+     * tag; in text content an apostrophe or quotation mark is an ordinary character.
+     * @param input the raw HTML
+     * @return the tokens in input order; an unfinished trailing tag gets a closing angle bracket appended
+     */
+    public static ArrayList<String> lex(String input) {
+        StringBuilder token = new StringBuilder();
+        ArrayList<String> tokens = new ArrayList<>();
+        boolean inTag = false;
+        boolean inSingleQuotes = false;
+        boolean inDoubleQuotes = false;
+
+        for (char i : input.toCharArray()) {
+            token.append(i);
+            switch (i) {
+                case '<':
+                    if (!inSingleQuotes && !inDoubleQuotes) {
+                        inTag = true;
+                        if (token.length() > 1) {
+                            tokens.add(token.substring(0, token.length() - 1));
+                            token.delete(0, token.length() - 1);
+                        }
+                    } else if (inTag) {
+                        System.out.printf("Probably failing parser");
+                    }
+                    break;
+                case '>':
+                    if (!inSingleQuotes && !inDoubleQuotes) {
+                        if (!inTag) {
+                            System.out.printf("Probably failing parser");
+                        }
+                        inTag = false;
+                        tokens.add(token.toString());
+                        token.setLength(0);
+                    }
+                    break;
+                case '"':
+                    if (inTag && !inSingleQuotes) {
+                        inDoubleQuotes = !inDoubleQuotes;
+                    }
+                    break;
+                case '\'':
+                    if (inTag && !inDoubleQuotes) {
+                        inSingleQuotes = !inSingleQuotes;
+                    }
+                    break;
+            }
+        }
+        // Invalid HTML may leave trailing garbage: an unfinished tag (closed here) or plain extra text.
+        if (token.length() > 0) {
+            tokens.add(inTag ? token + ">" : token.toString());
+        }
+        return tokens;
+    }
+}
diff --git a/src/main/model/util/AbstractTree.java b/src/main/model/util/AbstractTree.java
new file mode 100644
index 0000000..4c74732
--- /dev/null
+++ b/src/main/model/util/AbstractTree.java
@@ -0,0 +1,35 @@
+package model.util;
+
+import org.javatuples.*;
+
+import java.util.*;
+
+// Utility class for a general tree: we'll be using these a lot
+public abstract class AbstractTree<T> {
+    // The payload of this node: generic, e.g. a tag, attributes, a tag and attributes, etc.
+    private final T data;
+    // The child subtrees of this node.
+    private final ArrayList<AbstractTree<T>> children;
+
+    // future implementations may want to consider adding an Optional<> parent; or an Optional<> prevSibling
+
+    // Builds a node around the given data and child list. The list is stored as-is, not copied.
+    public AbstractTree(T data, ArrayList<AbstractTree<T>> children) {
+        this.data = data;
+        this.children = children;
+    }
+
+    public T getData() {
+        return this.data;
+    }
+
+    // Returns the list of children held by this node (the same list object, not a copy).
+    public ArrayList<AbstractTree<T>> getChildren() {
+        return this.children;
+    }
+
+    // Appends a child to the end of this node's list of children.
+    public void addChild(AbstractTree<T> child) {
+        children.add(child);
+    }
+}
diff --git a/src/main/model/util/Lexer.java b/src/main/model/util/Lexer.java
new file mode 100644
index 0000000..b35caa6
--- /dev/null
+++ b/src/main/model/util/Lexer.java
@@ -0,0 +1,58 @@
+package model.util;
+
+import java.util.*;
+
+// General-purpose Lexer
+public class Lexer {
+
+    // unused, helper function for if we implement finding identifiers longer than a character
+    private static int longestDelimiter(Set<String> delimiters) {
+        int longestDelimiter = 0;
+        for (String delimiter : delimiters) {
+            longestDelimiter = Math.max(longestDelimiter, delimiter.length());
+        }
+        return longestDelimiter;
+    }
+
+    /**
+     * Lexes a "free-form" language. "free-form" has a specific meaning here that's important to preserve:
+     * "free-form" means that _additional_ whitespace characters do not affect the language: e.g. two newlines
+     * instead of one, four spaces instead of two, etc. They are _not_ "whitespace-insensitive", which is usually
+     * a misnomer.
+     * The name's a bit of a joke: free-form languages are generally referred to as whitespace-insensitive -->
+     * insensitive == rude. Jokes are funnier when you have to explain them.
+     * Also, insensitiveLex() and freeformLex() aren't really that good of names.
+     *
+     * NOTE: This lexer only works with single-character delimiters.
+     * TODO: deduplicate whitespace
+     */
+    // public static ArrayList<String> rudeLex(String input, Set<Character> delimiters) {}
+
+    /**
+     * We might as well implement a lexer for non-free-form languages, but whatever. We won't use it.
+     * Splits the input at every delimiter character: each delimiter becomes a one-character token, and each
+     * non-empty run of characters between delimiters becomes a token, including a run that ends the input.
+     */
+    public static ArrayList<String> sensitiveLex(String input, Set<Character> delimiters) {
+        ArrayList<String> tokens = new ArrayList<String>();
+        StringBuilder currentToken = new StringBuilder();
+        // terrible c-style for loop because we may need to manipulate the index in the future
+        for (int i = 0; i < input.length(); i++) {
+            char nextToken = input.charAt(i);
+            if (delimiters.contains(nextToken)) {
+                if (currentToken.length() > 0) {
+                    tokens.add(currentToken.toString());
+                }
+                tokens.add(Character.toString(nextToken));
+                currentToken.setLength(0);
+            } else {
+                currentToken.append(nextToken);
+            }
+        }
+        // Flush the last run: without this, input not ending in a delimiter loses its final token.
+        if (currentToken.length() > 0) {
+            tokens.add(currentToken.toString());
+        }
+        return tokens;
+    }
+}
diff --git a/src/test/model/CssLexerTest.java b/src/test/model/CssLexerTest.java
new file mode 100644
index 0000000..4ed28e2
--- /dev/null
+++ b/src/test/model/CssLexerTest.java
@@ -0,0 +1,67 @@
+package model;
+
+import model.css.CssLexer;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class CssLexerTest {
+
+    // Lexes the example stylesheet and compares the tokens with the list expected for that file.
+    // The fixture path is relative, so it is resolved against the working directory of the test run.
+    @Test
+    void testIdiomaticHtml() {
+        String idiomaticCss;
+        try {
+            idiomaticCss = Files.readString(Path.of("data/example.css"));
+        } catch (IOException e) {
+            // Fail loudly: printing the error and returning would let the test pass without checking anything.
+            String dir = System.getProperty("user.dir");
+            throw new AssertionError("could not read data/example.css from " + dir, e);
+        }
+        String[] expected = {"body", "{", "background-color", ":", "#f0f0f2", ";", "margin", ":", "0", ";", "padding", ":", "0", ";", "font-family", ":", "-apple-system,", "system-ui,", "BlinkMacSystemFont,", "\"Segoe UI\",", "\"Open Sans\",", "\"Helvetica Neue\",", "Helvetica,", "Arial,", "sans-serif", ";", "}", "div", "{", "width", ":", "600px", ";", "margin", ":", "5em", "auto", ";", "padding", ":", "2em", ";", "background-color", ":", "#fdfdff", ";", "border-radius", ":", "0.5em", ";", "box-shadow", ":", "2px", "3px", "7px", "2px", "rgba(0,0,0,0.02)", ";", "}", "a", ":", "link,", "a", ":", "visited", "{", "color", ":", "#38488f", ";", "text-decoration", ":", "none", ";", "}", "@media", "(max-width", ":", "700px)", "{", "div", "{", "margin", ":", "0", "auto", ";", "width", ":", "auto", ";", "}", "}"};
+
+        // JUnit's assertEquals takes the expected value first, then the actual one.
+        assertEquals(Arrays.asList(expected), CssLexer.lex(idiomaticCss));
+    }
+/**
+    FoodServicesCard c1;
+    FoodServicesCard c2;
+    FoodServicesCard c3;
+
+    @BeforeEach
+    void runBefore() {
+        c1 = new FoodServicesCard(0);
+        c2 = new FoodServicesCard(100);
+        c3 = new FoodServicesCard(2000);
+    }
+
+    @Test
+    void testReloadingAndPurchasing() {
+        assertFalse(c1.makePurchase(100));
+        assertEquals(c1.getBalance(), 0);
+        c2.reload(10);
+        assertEquals(c2.getBalance(), 110);
+        assertTrue(c3.makePurchase(1400));
+        assertEquals(c3.getBalance(), 600);
+    }
+
+    @Test
+    void testRewardPoints() {
+        if (c1.makePurchase(c1.POINTS_NEEDED_FOR_CASH_BACK / 2)) {
+            assertEquals(c1.getRewardPoints(), (c1.POINTS_NEEDED_FOR_CASH_BACK / 2));
+        } else {
+            assertEquals(c1.getRewardPoints(), 0);
+        }
+        c2.makePurchase(c2.POINTS_NEEDED_FOR_CASH_BACK);
+        assertEquals(c2.getRewardPoints(), 0);
+        c3.makePurchase(1200);
+        assertEquals(c3.getRewardPoints(), 1200 % c3.POINTS_NEEDED_FOR_CASH_BACK);
+    }
+    */
+}
+\ No newline at end of file
diff --git a/src/test/model/HtmlLexerTest.java b/src/test/model/HtmlLexerTest.java
new file mode 100644
index 0000000..9dd5574
--- /dev/null
+++ b/src/test/model/HtmlLexerTest.java
@@ -0,0 +1,69 @@
+package model;
+
+import model.html.HtmlLexer;
+
+import org.junit.jupiter.api.*;
+
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class HtmlLexerTest {
+    String idiomaticHtml = "<!DOCTYPE html><html><head></head><body><p>Hello,world!</p></body></html>"; // every tag is closed
+    String brokenHtml = "<html><foo><bar></bar><ba"; // input ends in the middle of a tag
+    String trailingTextHtml = "<html><foo><bar></bar>ba"; // input ends with text after the last tag
+
+    @Test
+    void testIdiomaticHtml() { // complete input: one token per tag and one per run of text
+        String[] idiomaticHtmlArray = {"<!DOCTYPE html>","<html>","<head>","</head>","<body>","<p>","Hello,world!","</p>","</body>","</html>"};
+        assertEquals(Arrays.asList(idiomaticHtmlArray), HtmlLexer.lex(idiomaticHtml)); // expected value goes first
+    }
+
+    @Test
+    void testBrokenHtml() { // an unfinished trailing tag is expected to come back closed
+        String[] brokenHtmlArray = {"<html>","<foo>","<bar>","</bar>","<ba>"};
+        assertEquals(Arrays.asList(brokenHtmlArray), HtmlLexer.lex(brokenHtml)); // expected value goes first
+    }
+
+    @Test
+    void testTrailingTextHtml() { // text after the last tag is expected as a final token, unchanged
+        String[] trailingTextHtmlArray = {"<html>","<foo>","<bar>","</bar>","ba"};
+        assertEquals(Arrays.asList(trailingTextHtmlArray), HtmlLexer.lex(trailingTextHtml)); // expected value goes first
+    }
+
+/**
+    FoodServicesCard c1;
+    FoodServicesCard c2;
+    FoodServicesCard c3;
+
+    @BeforeEach
+    void runBefore() {
+        c1 = new FoodServicesCard(0);
+        c2 = new FoodServicesCard(100);
+        c3 = new FoodServicesCard(2000);
+    }
+
+    @Test
+    void testReloadingAndPurchasing() {
+        assertFalse(c1.makePurchase(100));
+        assertEquals(c1.getBalance(), 0);
+        c2.reload(10);
+        assertEquals(c2.getBalance(), 110);
+        assertTrue(c3.makePurchase(1400));
+        assertEquals(c3.getBalance(), 600);
+    }
+
+    @Test
+    void testRewardPoints() {
+        if (c1.makePurchase(c1.POINTS_NEEDED_FOR_CASH_BACK / 2)) {
+            assertEquals(c1.getRewardPoints(), (c1.POINTS_NEEDED_FOR_CASH_BACK / 2));
+        } else {
+            assertEquals(c1.getRewardPoints(), 0);
+        }
+        c2.makePurchase(c2.POINTS_NEEDED_FOR_CASH_BACK);
+        assertEquals(c2.getRewardPoints(), 0);
+        c3.makePurchase(1200);
+        assertEquals(c3.getRewardPoints(), 1200 % c3.POINTS_NEEDED_FOR_CASH_BACK);
+    }
+    */
+}
+\ No newline at end of file
diff --git a/src/test/model/MyModelTest.java b/src/test/model/MyModelTest.java
deleted file mode 100644
index c41f32e..0000000
--- a/src/test/model/MyModelTest.java
+++ /dev/null
@@ -1,7 +0,0 @@
-package model;
-
-import static org.junit.jupiter.api.Assertions.*;
-
-class MyModelTest {
-    // delete or rename this class!
-}
-\ No newline at end of file
author	j-james	2022-10-17 06:25:45 +0000
committer	j-james	2022-10-17 06:27:55 +0000
commit	3e9bb5fae16c35938bc1f7f7669c12cc355c9331 (patch)
tree	82e1ab837579e7762071ea97c064c0750a38c106 /src
parent	0845be5ec0215fb43f9dbdef00b22a733d4080b3 (diff)