diff options
author | j-james | 2022-10-17 06:25:45 +0000 |
---|---|---|
committer | j-james | 2022-10-17 06:27:55 +0000 |
commit | 3e9bb5fae16c35938bc1f7f7669c12cc355c9331 (patch) | |
tree | 82e1ab837579e7762071ea97c064c0750a38c106 /src/main/model/html | |
parent | 0845be5ec0215fb43f9dbdef00b22a733d4080b3 (diff) |
Basic prototypes of HTML/CSS lexers
Diffstat (limited to 'src/main/model/html')
-rw-r--r-- | src/main/model/html/HtmlLexer.java | 68 |
1 files changed, 68 insertions, 0 deletions
/**
 * Tokenizes HTML by tags. Each tag (everything from {@code <} through its closing {@code >},
 * attributes included) becomes a single token, and each run of text between tags becomes a
 * single token. The lexer does not interpret tag names, attributes, or whitespace; characters
 * are passed through to the tokens unchanged.
 */
public class HtmlLexer {

    /**
     * Takes a String of raw HTML and tokenizes it for our parser.
     *
     * <p>Quote characters are only significant inside a tag, where they delimit attribute values:
     * a {@code <} or {@code >} inside a quoted value does not start or end a tag. Outside a tag,
     * {@code '} and {@code "} are ordinary text, so an apostrophe in body text cannot hide the
     * tags that follow it.
     *
     * <p>Invalid HTML is tolerated rather than rejected. A stray {@code >} in text ends the
     * current text token, and an unfinished tag at the end of the input is emitted with a
     * {@code >} appended. Suspicious input is reported on standard output.
     *
     * @param input the raw HTML to tokenize
     * @return the tokens in document order; an empty list when {@code input} is empty
     * @throws NullPointerException if {@code input} is null
     */
    public static ArrayList<String> lex(String input) {
        ArrayList<String> tokens = new ArrayList<>();
        // StringBuilder avoids re-copying the pending token on every appended character.
        StringBuilder token = new StringBuilder();
        boolean inTag = false;
        boolean inSingleQuotes = false;
        boolean inDoubleQuotes = false;

        for (int pos = 0; pos < input.length(); pos++) {
            char c = input.charAt(pos);
            token.append(c);
            switch (c) {
                case '<':
                    if (!inSingleQuotes && !inDoubleQuotes) {
                        inTag = true;
                        // Anything accumulated before this '<' is a finished token of its own.
                        if (token.length() > 1) {
                            tokens.add(token.substring(0, token.length() - 1));
                            token.setLength(0);
                            token.append('<');
                        }
                    } else if (inTag) {
                        System.out.printf("Probably failing parser");
                    }
                    break;
                case '>':
                    if (!inSingleQuotes && !inDoubleQuotes) {
                        if (!inTag) {
                            System.out.printf("Probably failing parser");
                        }
                        inTag = false;
                        tokens.add(token.toString());
                        token.setLength(0);
                    }
                    break;
                case '"':
                    // Quotes delimit attribute values only; in text they are plain characters.
                    if (inTag && !inSingleQuotes) {
                        inDoubleQuotes = !inDoubleQuotes;
                    }
                    break;
                case '\'':
                    if (inTag && !inDoubleQuotes) {
                        inSingleQuotes = !inSingleQuotes;
                    }
                    break;
                default:
                    break;
            }
        }

        // When lexing invalid HTML we may end up with trailing garbage: either an unfinished tag
        // or extra text (those are the only two options since this is just the lex step).
        if (token.length() > 0) {
            if (inTag) {
                tokens.add(token + ">");
            } else {
                tokens.add(token.toString());
            }
        }
        return tokens;
    }
}