Basic HTML parsing and terminal rendering

author: j-james 2022-06-24 06:24:18 +0000
committer: j-james 2022-06-24 06:24:18 +0000
commit: 1b9287fcfb345aab4b31d7d6f4c0d330bd5e13f3 (patch)
tree: b78009bb59e88c14bf6b4e3bae2af13c921a8d0b /src/formats
parent: b629bff6211a18327d49d34eaee5a98471308030 (diff)
1 files changed, 107 insertions, 0 deletions
diff --git a/src/formats/html.nim b/src/formats/html.nim
new file mode 100644
index 0000000..8724295
--- /dev/null
+++ b/src/formats/html.nim
@@ -0,0 +1,107 @@
+import std/[strutils, sequtils, sugar, tables]
+
+# Todo:
+# - Handle implicit tags
+# - Handle comments
+# - Handle quoted attributes
+# - Ignore <> in <script> tags
+# - Transform parser into a state machine
+
+# Discriminator of the `Node` variant object: a node is either a run of
+# character data (`Text`) or a tag together with its children (`Element`).
+type NodeKind* = enum
+  Text, Element
+
+# Clever node implementation from callsamu and XmlNodeObj
+# Note that Text nodes are _only_ text.
+# ex. this <a>test</a> node is three nodes: "this ", " node", and the <a> tag
+# (the <a> element in turn keeps the text "test" in its `nested` list).
+type Node* {.acyclic.} = object
+  # Variant object: `kind` decides which of the fields below exist.
+  case kind*: NodeKind:
+    of Text:
+      # The character data, stored exactly as it was collected by the parser.
+      text*: string
+    of Element:
+      # Tag name; the parser lower-cases it before storing it here.
+      tag*: string
+      # Attribute name -> attribute value.
+      # NOTE(review): the original "change" marker suggests this field's type
+      # is provisional - confirm with the author.
+      attributes*: Table[string, string] # change
+      # Child nodes, in the order the parser attached them.
+      nested*: seq[Node]
+
+# Note that even plain text is valid HTML.
+# A parsed document is the sequence of its top-level nodes; this is the type
+# returned by `parseHTML`.
+type Html* = seq[Node]
+
+# Parser states. Not referenced by any code visible in this file yet;
+# presumably groundwork for the "Transform parser into a state machine" Todo
+# item - TODO confirm.
+type ParserState = enum
+  InTag, InStyle, InScript
+
+# Tags that `conclude` treats as complete as soon as they are read: the
+# element is attached to its parent (or to the result) without waiting for a
+# closing tag.
+const self_closing_tags = [
+  "area", "base", "br", "col", "embed", "hr", "img", "input",
+  "link", "meta", "param", "source", "track", "wbr",
+]
+
+# Not referenced by any code visible in this file yet. Presumably the tags
+# that a document may leave out (see "Handle implicit tags" in the Todo list)
+# - TODO confirm.
+const implicit_tags = [
+  "html", "head", "body"
+]
+
+# Not referenced by any code visible in this file yet. Presumably the tags
+# that may only appear inside <head> - TODO confirm intended use.
+const head_exclusive_tags = [
+  "base", "basefont", "bgsound", "head", "link",
+  "meta", "noscript", "style", "script", "title",
+]
+
+func attributes(attributes: seq[string]): Table[string, string] =
+  ## Builds an attribute table from raw `name=value` tokens.
+  ##
+  ## Each token is split on its first `=` only, so a value may itself contain
+  ## `=`. Names are lower-cased (ASCII) and double quotes surrounding the
+  ## value are stripped.
+  ##
+  ## Empty tokens (the result of repeated spaces inside a tag) are skipped
+  ## quietly. Any other token without an `=` is reported through `debugEcho`
+  ## and left out of the table.
+  for token in attributes:
+    # An empty token is a splitting artefact, not a malformed attribute.
+    if token.len == 0: continue
+    let pair = token.split('=', maxsplit = 1)
+    if pair.len != 2: debugEcho "Invalid attribute ", pair
+    else: result[pair[0].toLowerAscii] = pair[1].strip(true, true, {'"'})
+
+func conclude(buffer: string, unfinished: var seq[Node], result: var Html) =
+  ## Handles the text found between a `<` and its `>`.
+  ##
+  ## * `buffer` - the tag contents, without the angle brackets.
+  ## * `unfinished` - stack of elements that are still open; innermost last.
+  ## * `result` - finished top-level nodes.
+  ##
+  ## Opening tags are pushed onto `unfinished`. A closing tag pops the
+  ## innermost open element, whatever its name, and attaches it to its
+  ## parent (or to `result` if it has none). Self-closing tags are attached
+  ## right away. Closing tags with nothing open, closing tags of
+  ## self-closing elements and tags without a name are ignored.
+  # We will render everything in Standards Mode.
+  if buffer.toLowerAscii == "!doctype html": return
+
+  # Drop the '/' that ends "<br/>"-style tags, then split on any whitespace
+  # so newlines, tabs and repeated spaces inside a tag yield no empty tokens.
+  let parts = buffer.strip(false, true, {'/'}).splitWhitespace()
+  # "<>" and "</>" carry no tag name: there is nothing to open or close.
+  if parts.len == 0: return
+  let tag = parts[0].toLowerAscii
+
+  if tag[0] == '/':
+    # "</br>" and friends: such elements never sit on the stack, so popping
+    # here would wrongly close an unrelated element.
+    if tag[1..^1] in self_closing_tags: return
+    # A closing tag with nothing open has nothing to close.
+    if unfinished.len == 0: return
+    let closed = unfinished.pop
+    if unfinished.len > 0: unfinished[^1].nested.add(closed)
+    else: result.add(closed)
+    return
+
+  let node = Node(kind: Element, tag: tag,
+                  attributes: attributes(parts[1..^1]), nested: @[])
+  if tag notin self_closing_tags:
+    # Opening tag: keep it open until its closing tag shows up.
+    unfinished.add(node)
+  elif unfinished.len > 0:
+    # Self-closing tag with a parent: attach it immediately.
+    unfinished[^1].nested.add(node)
+  else:
+    # Self-closing tag at top level.
+    result.add(node)
+
+func finish(unfinished: var seq[Node], result: var seq[Node]) =
+  ## Closes every element that is still open once the input is exhausted.
+  ##
+  ## The stack is folded from the innermost element outwards: each element is
+  ## appended to the `nested` list of the one beneath it, and the outermost
+  ## element is appended to `result`. An empty stack leaves `result` as is.
+  while unfinished.len >= 2:
+    # Pop first, then index: the parent is the new top of the stack.
+    let innermost = unfinished.pop
+    unfinished[^1].nested.add(innermost)
+  if unfinished.len == 1:
+    let root = unfinished.pop
+    result.add(root)
+
+func addText(text: string, unfinished: var seq[Node], roots: var Html) =
+  ## Appends `text` as a Text node to the innermost open element, or to
+  ## `roots` when no element is open. Whitespace-only text is dropped.
+  if text.strip().len == 0: return
+  let node = Node(kind: Text, text: text)
+  if unfinished.len > 0: unfinished[^1].nested.add(node)
+  else: roots.add(node)
+
+# This implementation naively keeps track of opening/closing tags by order, not content.
+func parseHTML*(html: string): Html =
+  ## Parses `html` into its top-level nodes.
+  ##
+  ## The input is scanned one character at a time. Characters outside of
+  ## `<...>` are collected as text, characters inside are handed to
+  ## `conclude` once the closing `>` is reached. Elements still open at the
+  ## end of the input are closed by `finish`.
+  ##
+  ## Text that appears while no element is open (including input without
+  ## any tags at all) becomes a top-level Text node. A tag that is never
+  ## terminated by `>` is discarded.
+  var inTag = false
+  var buffer = ""
+  var unfinished: seq[Node] = @[]
+
+  for c in html:
+    # Beginning of a tag
+    if not inTag and c == '<':
+      # Store the text collected so far, if there is any
+      addText(buffer, unfinished, result)
+      inTag = true
+      buffer = ""
+    # End of a tag
+    elif inTag and c == '>':
+      conclude(buffer, unfinished, result)
+      inTag = false
+      buffer = ""
+    else:
+      buffer &= c
+
+  # Text following the last tag would otherwise be lost.
+  if not inTag:
+    addText(buffer, unfinished, result)
+  finish(unfinished, result)
+
+proc renderSource*(html: string) =
+  ## Writes `html` to standard output unchanged, one character at a time.
+  for ch in html.items:
+    stdout.write(ch)
author	j-james	2022-06-24 06:24:18 +0000
committer	j-james	2022-06-24 06:24:18 +0000
commit	1b9287fcfb345aab4b31d7d6f4c0d330bd5e13f3 (patch)
tree	b78009bb59e88c14bf6b4e3bae2af13c921a8d0b /src/formats
parent	b629bff6211a18327d49d34eaee5a98471308030 (diff)