/**
 * Splits raw HTML into a flat list of string tokens for the parser.
 *
 * <p>A token is either a tag (everything from {@code <} through its closing {@code >}) or the
 * run of text found between two tags. The contents of a tag and its attributes are not
 * interpreted here. Every input character is copied into exactly one token, so whitespace is
 * passed through verbatim; no whitespace normalization happens at this stage.
 */
public class HtmlLexer {

    /**
     * Tokenizes a string of raw HTML.
     *
     * <p>Single and double quotes are only significant inside a tag, where they stop a
     * {@code <} or {@code >} in an attribute value from being treated as a tag delimiter.
     * Quote characters in the text between tags are ordinary characters, so an apostrophe in
     * running text (for example {@code <p>Don't</p>}) does not affect how later tags are split.
     *
     * <p>Malformed input never fails: suspicious constructs print a short diagnostic to standard
     * output, and an unfinished trailing tag is closed with a synthetic {@code >}.
     *
     * @param input the raw HTML to tokenize; must not be {@code null}
     * @return the tokens in document order; empty when {@code input} is empty
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public static ArrayList<String> lex(String input) {
        ArrayList<String> tokens = new ArrayList<>();
        // A builder avoids re-copying the pending token for every appended character.
        StringBuilder token = new StringBuilder();
        boolean inTag = false;
        boolean inSingleQuotes = false;
        boolean inDoubleQuotes = false;

        for (int i = 0; i < input.length(); i++) {
            char c = input.charAt(i);
            token.append(c);
            switch (c) {
                case '<':
                    if (!inSingleQuotes && !inDoubleQuotes) {
                        inTag = true;
                        // Anything collected before this '<' is its own token (text, or a tag
                        // that was never closed); the new token then starts at the '<'.
                        if (token.length() > 1) {
                            tokens.add(token.substring(0, token.length() - 1));
                            token.setLength(0);
                            token.append('<');
                        }
                    } else if (inTag) {
                        // A '<' inside a quoted attribute value.
                        System.out.printf("Probably failing parser");
                    }
                    break;
                case '>':
                    if (!inSingleQuotes && !inDoubleQuotes) {
                        if (!inTag) {
                            // A '>' with no opening '<' before it.
                            System.out.printf("Probably failing parser");
                        }
                        inTag = false;
                        tokens.add(token.toString());
                        token.setLength(0);
                    }
                    break;
                case '"':
                    // Quotes delimit attribute values only; in text they are plain characters.
                    if (inTag && !inSingleQuotes) {
                        inDoubleQuotes = !inDoubleQuotes;
                    }
                    break;
                case '\'':
                    if (inTag && !inDoubleQuotes) {
                        inSingleQuotes = !inSingleQuotes;
                    }
                    break;
                default:
                    break;
            }
        }

        // When lexing invalid HTML we may end up with trailing garbage: either an unfinished tag
        // or extra text (the only two options, since this is just the lex step). An unfinished
        // tag is closed so that every tag token ends with '>'.
        if (token.length() > 0) {
            if (inTag) {
                token.append('>');
            }
            tokens.add(token.toString());
        }
        return tokens;
    }
}