aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/formats/html.nim | 107
-rw-r--r-- src/gui/terminal.nim | 35
-rw-r--r-- src/html.nim | 30
3 files changed, 142 insertions, 30 deletions
diff --git a/src/formats/html.nim b/src/formats/html.nim
new file mode 100644
index 0000000..8724295
--- /dev/null
+++ b/src/formats/html.nim
@@ -0,0 +1,107 @@
+import std/[strutils, sequtils, sugar, tables]
+
+# Todo:
+# - Handle implicit tags
+# - Handle comments
+# - Handle quoted attributes
+# - Ignore <> in <script> tags
+# - Transform parser into a state machine
+
+type NodeKind* = enum ## Discriminator for the two shapes of a `Node`.
+  Text,   ## Character data found between tags.
+  Element ## A tag together with its attributes and nested nodes.
+
+# Variant-object node, modelled on callsamu's design and on XmlNodeObj.
+# A Text node holds text and nothing else, so mixed content is split up:
+# "this <a>test</a> node" is three nodes: "this ", " node", and the <a> tag.
+type Node* {.acyclic.} = object
+  case kind*: NodeKind
+  of Text:
+    text*: string ## The character data itself.
+  of Element:
+    tag*: string ## Name of the tag.
+    attributes*: Table[string, string] ## Name -> value. Author note: change
+    nested*: seq[Node] ## Nodes contained in this element, in source order.
+
+# A document is the sequence of its top-level nodes. Plain text without any
+# tags is valid HTML too.
+type
+  Html* = seq[Node]
+
+# NOTE(review): not referenced by the code visible in this file; presumably
+# groundwork for the "state machine" item in the Todo list - confirm.
+type
+  ParserState = enum
+    InTag, InStyle, InScript
+
+const
+  # Elements that never take a closing tag; `conclude` completes them as
+  # soon as they are opened.
+  self_closing_tags = [
+    "area", "base", "br", "col", "embed", "hr", "img",
+    "input", "link", "meta", "param", "source", "track", "wbr",
+  ]
+
+  # NOTE(review): unused in the visible code; presumably the tags covered by
+  # the "Handle implicit tags" Todo item - confirm.
+  implicit_tags = ["html", "head", "body"]
+
+  # NOTE(review): unused in the visible code; by its name, the tags that may
+  # only appear inside <head> - confirm.
+  head_exclusive_tags = [
+    "base", "basefont", "bgsound", "head", "link",
+    "meta", "noscript", "style", "script", "title",
+  ]
+
+func attributes(attributes: seq[string]): Table[string, string] =
+  ## Builds an attribute table from the pieces of a tag that follow the tag
+  ## name, e.g. `@["href=\"/\"", "hidden"]`.
+  ##
+  ## Each piece is split at its first `=`. Names are ASCII-lowercased and
+  ## values lose any surrounding double quotes. A bare name such as `hidden`
+  ## (an HTML boolean attribute) is stored with an empty value. Pieces that
+  ## have no name are silently ignored.
+  for piece in attributes:
+    let pair = piece.split('=', maxsplit = 1)
+    let name = pair[0].toLowerAscii
+    # An empty piece comes from repeated spaces in the tag; "=x" has no name.
+    if name.len == 0: continue
+    result[name] =
+      if pair.len == 2: pair[1].strip(chars = {'"'})
+      else: ""
+
+func conclude(buffer: string, unfinished: var seq[Node], result: var Html) =
+  ## Processes the text found between a `<` and the following `>`.
+  ##
+  ## * `buffer` - the tag contents, without the angle brackets.
+  ## * `unfinished` - stack of elements opened but not yet closed,
+  ##   innermost last.
+  ## * `result` - completed top-level nodes of the document.
+  ##
+  ## An opening tag is pushed onto `unfinished`. A closing tag pops the
+  ## innermost open element and attaches it to its parent, or to `result`
+  ## when it has no parent; tags are matched by order, not by name. Elements
+  ## listed in `self_closing_tags` are attached immediately. Doctype
+  ## declarations, empty tags and stray closing tags are ignored.
+
+  # We will render everything in Standards Mode, so any doctype is dropped.
+  if buffer.strip().toLowerAscii.startsWith("!doctype"): return
+
+  # Drop a trailing "/" (as in <br/>), then split on any run of whitespace so
+  # tabs, newlines and repeated spaces inside a tag are all accepted.
+  let parts = buffer.strip(leading = false, chars = {'/'}).splitWhitespace()
+  if parts.len == 0: return # "<>" or "</>": nothing to record
+  let tag = parts[0].toLowerAscii
+
+  if tag in self_closing_tags:
+    # No children are possible: build the element and attach it right away.
+    let node = Node(kind: Element, tag: tag,
+                    attributes: attributes(parts[1..^1]), nested: @[])
+    if unfinished.len > 0: unfinished[^1].nested.add(node)
+    else: result.add(node)
+  elif tag[0] == '/':
+    # A closing tag with nothing open has no element to complete.
+    if unfinished.len == 0: return
+    # Pop first, then attach, so the parent is looked up on the shrunk stack.
+    let done = unfinished.pop
+    if unfinished.len > 0: unfinished[^1].nested.add(done)
+    else: result.add(done)
+  else:
+    # Opening tag: keep it on the stack until its closing tag arrives.
+    unfinished.add(Node(kind: Element, tag: tag,
+                        attributes: attributes(parts[1..^1]), nested: @[]))
+
+func finish(unfinished: var seq[Node], result: var seq[Node]) =
+  ## Empties the stack of still-open elements. Each element is nested into
+  ## the one opened before it; the outermost one is appended to `result`.
+  while unfinished.len > 0:
+    let innermost = unfinished.pop
+    if unfinished.len == 0:
+      result.add(innermost)
+    else:
+      unfinished[^1].nested.add(innermost)
+
+func addText(text: string, unfinished: var seq[Node], result: var Html) =
+  ## Stores `text` as a Text node in the innermost open element, or in
+  ## `result` when no element is open. Whitespace-only text is skipped.
+  if text.strip() == "": return
+  let node = Node(kind: Text, text: text)
+  if unfinished.len > 0: unfinished[^1].nested.add(node)
+  else: result.add(node)
+
+# This implementation naively keeps track of opening/closing tags by order,
+# not content.
+func parseHTML*(html: string): Html =
+  ## Parses `html` into its sequence of top-level nodes.
+  ##
+  ## Characters are collected into a buffer. `<` ends a run of text and `>`
+  ## ends a tag, which is handed to `conclude`. Elements still open at the
+  ## end of input are closed by `finish`. Text outside any element, including
+  ## input that contains no tags at all, becomes a top-level Text node.
+  ## An unterminated tag at the end of input is discarded.
+  var in_tag = false
+  var buffer = ""
+  var unfinished: seq[Node] = @[]
+
+  for c in html:
+    # Beginning of a tag
+    if not in_tag and c == '<':
+      addText(buffer, unfinished, result)
+      in_tag = true
+      buffer = ""
+    # End of a tag
+    elif in_tag and c == '>':
+      conclude(buffer, unfinished, result)
+      in_tag = false
+      buffer = ""
+    else:
+      buffer &= c
+
+  # Text after the last tag would otherwise be lost.
+  if not in_tag:
+    addText(buffer, unfinished, result)
+  finish(unfinished, result)
+
+proc renderSource*(html: string) =
+  ## Writes `html` to standard output unchanged, one character at a time.
+  for ch in html.items:
+    stdout.write(ch)
diff --git a/src/gui/terminal.nim b/src/gui/terminal.nim
new file mode 100644
index 0000000..e893081
--- /dev/null
+++ b/src/gui/terminal.nim
@@ -0,0 +1,35 @@
+import std/[strutils, tables], ../formats/html
+
+proc print(node: Node, indent=0, raw=false) =
+  ## Writes `node` and everything nested in it to standard output.
+  ##
+  ## * `indent` - number of spaces written in front of this node.
+  ## * `raw` - when true, no indentation is written for this node or for any
+  ##   node nested in it.
+  ##
+  ## An element is written as an opening tag line, its nested nodes, and a
+  ## closing tag line. A text node is written as-is, with no newline added.
+  let padding = if raw: "" else: " ".repeat(indent)
+  case node.kind
+  of Element:
+    stdout.write(padding)
+    stdout.write("<" & node.tag)
+    for name, value in node.attributes.pairs:
+      stdout.write(" " & name & "=" & value)
+    stdout.write(">")
+    stdout.write('\n')
+    for child in node.nested:
+      # Hand `raw` down so the whole subtree is printed in the same mode.
+      child.print(indent + 2, raw)
+    stdout.write(padding)
+    stdout.write("</" & node.tag & ">")
+    stdout.write('\n')
+  of Text:
+    stdout.write(padding)
+    stdout.write(node.text)
+
+proc render*(html: Html) =
+  ## Prints each top-level node of `html` using the default `print` options.
+  for topLevel in html:
+    print(topLevel)
+
+proc renderSource*(html: Html) =
+  ## Prints each top-level node of `html` with `raw` set, so the top-level
+  ## tags are written without leading indentation.
+  for topLevel in html:
+    print(topLevel, indent = 0, raw = true)
+
+when isMainModule:
+  # Manual check: fetch one page, parse its body and print the node tree.
+  import ../protocols/http, ../uri
+  let address = "https://example.org:443/index.html"
+  let response = httpRequest(parseUrl(address))
+  renderSource(parseHTML(response.body))
diff --git a/src/html.nim b/src/html.nim
deleted file mode 100644
index 4ae5ee1..0000000
--- a/src/html.nim
+++ /dev/null
@@ -1,30 +0,0 @@
-type
-  # A document, held as a list of strings named `tags`.
-  Html = object
-    tags: seq[string]
-
-  # One tag: its name, its text, and the tags nested inside it.
-  Tag {.acyclic.} = object
-    name: string
-    text: string
-    nested: seq[Tag]
-
-func parseHTML(html: string): Html =
-  ## Stub: does no parsing and returns a default-initialised `Html`.
-  discard
-
-# Todo: revamp parsing: keep track of tags, entities, etc
-proc renderHTML*(html: string) =
-  ## Writes the visible text of `html` to standard output: each printable
-  ## ASCII character or newline that lies outside `<...>` and comes after
-  ## the first `<body` has been seen.
-  const bodyOpen = "<body"
-  var
-    in_angle = false
-    in_body = false
-
-  # Iterating a string with two loop variables yields (index, char).
-  for i, c in html:
-    if c == '<':
-      in_angle = true
-      # Only slice when enough characters remain; a `<` within the last four
-      # characters would otherwise index past the end of the string.
-      if i + bodyOpen.len <= html.len and
-          html[i ..< i + bodyOpen.len] == bodyOpen:
-        in_body = true
-    elif c == '>':
-      in_angle = false
-    elif not in_angle and in_body and c in {char(32)..char(126), '\n'}:
-      stdout.write(c)
-
-proc renderSource*(html: string) =
-  ## Writes `html` to standard output unchanged, one character at a time.
-  for ch in html.items:
-    stdout.write(ch)