diff options
author | j-james | 2022-06-24 06:24:18 +0000 |
---|---|---|
committer | j-james | 2022-06-24 06:24:18 +0000 |
commit | 1b9287fcfb345aab4b31d7d6f4c0d330bd5e13f3 (patch) | |
tree | b78009bb59e88c14bf6b4e3bae2af13c921a8d0b /src | |
parent | b629bff6211a18327d49d34eaee5a98471308030 (diff) |
Basic HTML parsing and terminal rendering
Diffstat (limited to 'src')
-rw-r--r-- | src/formats/html.nim | 107 | ||||
-rw-r--r-- | src/gui/terminal.nim | 35 | ||||
-rw-r--r-- | src/html.nim | 30 |
3 files changed, 142 insertions, 30 deletions
# src/formats/html.nim
# (new file in this commit; mode 100644, blob 8724295; diff markers removed)
import std/[strutils, sequtils, sugar, tables]

# Todo:
# - Handle implicit tags
# - Handle comments
# - Handle quoted attributes
# - Ignore <> in <script> tags
# - Transform parser into a state machine

type NodeKind* = enum
  ## Discriminator for `Node`: raw text or a tagged element.
  Text, Element

# Clever node implementation from callsamu and XmlNodeObj
# Note that Text nodes are _only_ text.
# ex. this <a>test</a> node is three nodes: "this ", " node", and the <a> tag
type Node* {.acyclic.} = object
  ## One node of the parsed document tree.
  case kind*: NodeKind
  of Text:
    text*: string                       ## the text, exactly as collected
  of Element:
    tag*: string                        ## lower-cased tag name
    attributes*: Table[string, string]  # change
    nested*: seq[Node]                  ## children, in document order

# Note that even plain text is valid HTML.
type Html* = seq[Node]

# Not referenced by the code visible in this commit (see the Todo list above).
type ParserState = enum
  InTag, InStyle, InScript

# Elements that never take a closing tag; `conclude` attaches them immediately.
const self_closing_tags = [
  "area", "base", "br", "col", "embed", "hr", "img", "input",
  "link", "meta", "param", "source", "track", "wbr",
]

# Not referenced by the code visible in this commit.
const implicit_tags = [
  "html", "head", "body"
]

# Not referenced by the code visible in this commit.
const head_exclusive_tags = [
  "base", "basefont", "bgsound", "head", "link",
  "meta", "noscript", "style", "script", "title",
]

func attributes(attributes: seq[string]): Table[string, string] =
  ## Builds the attribute table from the whitespace-separated fragments that
  ## follow a tag name (`name=value` or a bare `name`).
  ##
  ## - Names are lower-cased; a later duplicate overwrites an earlier one.
  ## - Surrounding single or double quotes are stripped from the value.
  ## - A fragment without `=` is stored with an empty value.
  ## - Fragments with an empty name (e.g. `=x`) are skipped silently.
  for fragment in attributes:
    let pair = fragment.split('=', maxsplit = 1)
    let name = pair[0].toLowerAscii
    if name.len == 0:
      continue
    result[name] =
      if pair.len == 2: pair[1].strip(chars = {'"', '\''})
      else: ""

func attach(node: Node, unfinished: var seq[Node], document: var Html) =
  ## Adds `node` as the last child of the innermost open element, or to
  ## `document` when no element is open.
  if unfinished.len > 0:
    unfinished[^1].nested.add(node)
  else:
    document.add(node)

func conclude(buffer: string, unfinished: var seq[Node], result: var Html) =
  ## Handles the text found between one `<` and the following `>`.
  ##
  ## `buffer` is the tag content without the angle brackets, `unfinished` is
  ## the stack of elements that are still open (innermost last) and `result`
  ## receives finished top-level nodes.
  ##
  ## - A doctype declaration is skipped.
  ## - An empty tag (`<>`, `</>`) is skipped.
  ## - A closing tag finishes the innermost open element. Tags are matched by
  ##   order, not by name; a closing tag with nothing open is ignored.
  ## - A tag listed in `self_closing_tags` is attached right away.
  ## - Any other tag is pushed onto `unfinished`.
  let trimmed = buffer.strip()
  # We will render everything in Standards Mode.
  if trimmed.toLowerAscii.startsWith("!doctype"):
    return

  # Drop the trailing '/' of `<br/>` / `<br />`, then split on any whitespace
  # so that tabs and newlines inside a tag separate attributes too.
  let parts = trimmed
    .strip(leading = false, trailing = true, chars = {'/'})
    .splitWhitespace()
  if parts.len == 0:
    return

  let tag = parts[0].toLowerAscii
  if tag[0] == '/':
    # Pop before touching the parent so the stack is never read and
    # modified within the same expression.
    if unfinished.len > 0:
      let closed = unfinished.pop
      attach(closed, unfinished, result)
  else:
    let node = Node(kind: Element, tag: tag,
                    attributes: parts[1..^1].attributes, nested: @[])
    if tag in self_closing_tags:
      attach(node, unfinished, result)
    else:
      unfinished.add(node)

func finish(unfinished: var seq[Node], result: var seq[Node]) =
  ## Closes every element still open: each one becomes the last child of the
  ## element below it on the stack, and the outermost one goes to `result`.
  ## `unfinished` is empty afterwards.
  while unfinished.len > 0:
    let node = unfinished.pop
    attach(node, unfinished, result)

# This implementation naively keeps track of opening/closing tags by order, not content.
func addText(buffer: string, unfinished: var seq[Node], document: var Html) =
  ## Stores `buffer` as a Text node unless it is empty or whitespace-only.
  ## The node becomes a child of the innermost open element, or a top-level
  ## node of `document` when no element is open.
  if buffer.strip() == "":
    return
  let node = Node(kind: Text, text: buffer)
  if unfinished.len > 0:
    unfinished[^1].nested.add(node)
  else:
    document.add(node)

func parseHTML*(html: string): Html =
  ## Parses `html` into a sequence of top-level nodes.
  ##
  ## The input is scanned one character at a time. Everything between `<` and
  ## the next `>` is handed to `conclude`; everything else is collected as
  ## text. Whitespace-only text is dropped. Text that appears before the first
  ## tag, between top-level elements or after the last tag is kept as a
  ## top-level Text node. An unterminated tag at the end of the input is
  ## discarded. Elements still open at the end are closed by `finish`.
  var in_tag = false
  var buffer = ""
  var unfinished: seq[Node] = @[]

  for c in html:
    # Beginning of a tag
    if not in_tag and c == '<':
      # Add the collected text content to the parent node, if there is text
      addText(buffer, unfinished, result)
      in_tag = true
      buffer = ""
    # End of a tag
    elif in_tag and c == '>':
      conclude(buffer, unfinished, result)
      in_tag = false
      buffer = ""
    else:
      buffer.add(c)

  # Text after the last tag (or input without any tag at all).
  if not in_tag:
    addText(buffer, unfinished, result)
  finish(unfinished, result)

proc renderSource*(html: string) =
  ## Writes `html` to stdout unchanged.
  stdout.write(html)

# src/gui/terminal.nim
# (new file in this commit; mode 100644, blob e893081; diff markers removed)
import std/[strutils, tables], ../formats/html

proc print(node: Node, indent=0, raw=false) =
  ## Writes `node` and its descendants to stdout.
  ##
  ## Elements are written as an opening tag line (attributes as `name=value`,
  ## values are not quoted), the children, then a closing tag line. Children
  ## are indented two more columns than their parent. With `raw` set, no
  ## indentation is written at any depth. Text nodes are written without a
  ## trailing newline.
  let padding = if raw: "" else: " ".repeat(indent)
  case node.kind
  of Element:
    var opening = padding & "<" & node.tag
    for name, value in node.attributes.pairs:
      opening.add(" " & name & "=" & value)
    stdout.write(opening & ">")
    stdout.write('\n')
    for child in node.nested:
      # Pass `raw` down; otherwise it would only apply to the root, whose
      # indent is already 0.
      child.print(indent + 2, raw)
    stdout.write(padding & "</" & node.tag & ">")
    stdout.write('\n')
  of Text:
    stdout.write(padding & node.text)

proc render*(html: Html) =
  ## Prints every top-level node as an indented tree.
  for node in html:
    node.print()

proc renderSource*(html: Html) =
  ## Prints every top-level node without indentation.
  for node in html:
    node.print(0, true)

when isMainModule:
  # Manual smoke test. `httpRequest` and `parseUrl` come from project modules
  # that are not part of this page; presumably this fetches the URL below.
  import ../protocols/http, ../uri
  let url = "https://example.org:443/index.html"
  let request = httpRequest(parseUrl(url))
  let parsed = parseHTML(request.body)
  renderSource(parsed)

#[ src/html.nim
   Deleted by this commit (mode 100644, blob 4ae5ee1). Previous contents,
   kept here for reference only:

type Html = object
  tags: seq[string]

type Tag {.acyclic.} = object
  name: string
  text: string
  nested: seq[Tag]

func parseHTML(html: string): Html = discard

# Todo: revamp parsing: keep track of tags, entities, etc
proc renderHTML*(html: string) =
  var
    in_angle = false
    in_body = false

  # _Why_ is it i, c and not c, i...
  for i, c in html:
    if c == '<':
      in_angle = true
      if html[i..i+4] == "<body":
        in_body = true
    elif c == '>':
      in_angle = false
    elif not in_angle and in_body and c in {char(32)..char(126), '\n'}:
      stdout.write(c)

proc renderSource*(html: string) =
  for i, c in html:
    stdout.write(c)
]#