src/formats/html.nim


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

import std/[strutils, sequtils, sugar, tables]

# Todo:
# - Handle implicit tags
# - Handle comments
# - Handle quoted attributes
# - Ignore <> in <script> tags
# - Transform parser into a state machine

# Discriminator for the two variants of `Node`:
# - Text:    a run of character data (the node carries a `text` field)
# - Element: a tag (the node carries `tag`, `attributes` and `children`)
type NodeKind* = enum
  Text, Element

# A single node of the parsed document tree, modelled as an object variant
# discriminated by `kind`.
# Clever node implementation from callsamu and XmlNodeObj.
# Note that Text nodes are _only_ text; they never have children.
# ex. `this <a>test</a> node` yields three sibling nodes: the text "this ",
# the <a> element, and the text " node". "test" is a Text child of <a>.
type Node* = ref object
  # parent*: Node # Disabled: a back-reference to the parent would make the
  #               # tree cyclic, and we would have to deal with those cycles.
  case kind*: NodeKind:
    of Text:
      # Character data carried by this node.
      text*: string
    of Element:
      # Tag name of the element.
      tag*: string
      # Attribute name -> attribute value.
      attributes*: Table[string, string] # author's note: "change" (intended replacement not specified)
      # Child nodes, in the order they were added.
      children*: seq[Node]

# A parsed document: the sequence of its top-level nodes.
# The type does not require a single root element, and a top-level node may
# itself be a Text node.
# Note that even plain text is valid HTML, in this implementation.
type Html* = seq[Node]

# States for the parser.
# NOTE(review): not referenced anywhere in the visible part of this file;
# presumably reserved for the state-machine rewrite listed in the Todo at the
# top of the module. Confirm before relying on it.
type ParserState = enum
  InTag, InStyle, InScript

# Tags that never get a separate closing tag. `conclude` attaches an element
# with one of these names to its parent (or to the result) as soon as its tag
# has been read, instead of leaving it open for children.
# Names are lowercase; `conclude` lowercases the tag name before the lookup.
const self_closing_tags = [
  "area", "base", "br", "col", "embed", "hr", "img", "input",
  "link", "meta", "param", "source", "track", "wbr",
]

# Tag names treated as "implicit".
# NOTE(review): not referenced in the visible code; presumably meant for the
# "Handle implicit tags" Todo item. Confirm the intended use.
const implicit_tags = [
  "html", "head", "body"
]

# Tag names listed as exclusive to the document head.
# NOTE(review): not referenced in the visible code; presumably intended to
# help decide where an implicit <head> ends. Confirm the intended use.
const head_exclusive_tags = [
  "base", "basefont", "bgsound", "head", "link",
  "meta", "noscript", "style", "script", "title",
]

func attributes(attributes: seq[string]): Table[string, string] =
  ## Builds an attribute table from raw `name=value` tokens.
  ##
  ## - `attributes`: the tokens that followed the tag name, one attribute each.
  ##
  ## Returns a table mapping the lowercased (ASCII) attribute name to its
  ## value. Each token is split at the first `=` only, so a value may itself
  ## contain `=`. Surrounding double or single quotes are removed from the
  ## value. A token without `=` (e.g. `disabled`) is stored with an empty
  ## value. Tokens with an empty name (empty strings, `=foo`) are silently
  ## ignored. When a name occurs more than once, the last occurrence wins.
  for raw in attributes:
    let parts = raw.split('=', maxsplit = 1)
    let name = parts[0].strip().toLowerAscii
    # Silently ignore invalid attributes: there is nothing to key them by.
    if name.len == 0:
      continue
    result[name] =
      if parts.len == 2: parts[1].strip(chars = {'"', '\''})
      else: ""

func conclude(buffer: string, unfinished: var seq[Node], result: var Html) =
  ## Processes the content of one complete tag (the text between `<` and `>`).
  ##
  ## - `buffer`: the raw tag content, without the angle brackets.
  ## - `unfinished`: stack of elements that are open but not yet closed; the
  ##   innermost open element is last.
  ## - `result`: receives elements that are completed at the top level.
  ##
  ## Behaviour by tag type:
  ## - doctype declarations and empty tags (`<>`, `</>`) are ignored;
  ## - a closing tag pops the innermost open element and attaches it to its
  ##   parent, or to `result` when it has no parent. Closing tags are matched
  ##   by order only, not by name. A closing tag with nothing open is ignored;
  ## - a self-closing tag (see `self_closing_tags`) is attached to the
  ##   innermost open element, or to `result` when nothing is open;
  ## - any other tag is pushed onto `unfinished` as a newly opened element.
  let content = buffer.strip()

  # We will render everything in Standards Mode, so the doctype is dropped.
  if content.toLowerAscii.startsWith("!doctype"):
    return

  # Drop a trailing "/" (as in `<br/>`), then split on any whitespace run so
  # tabs, newlines and repeated spaces inside a tag are handled.
  let parts = content.strip(leading = false, chars = {'/'}).splitWhitespace()
  if parts.len == 0:
    return

  let tag = parts[0].toLowerAscii

  # Closing tag: complete the innermost open element.
  if tag[0] == '/':
    if unfinished.len == 0:
      return # stray closing tag, nothing to close
    let closed = unfinished.pop
    if unfinished.len > 0:
      unfinished[^1].children.add(closed)
    else:
      result.add(closed)
    return

  let attrs = parts[1..^1].attributes
  let node = Node(kind: Element, tag: tag, attributes: attrs, children: @[])

  if tag in self_closing_tags:
    # Self-closing tag: it is complete already, so attach it right away.
    if unfinished.len > 0:
      unfinished[^1].children.add(node)
    else:
      result.add(node)
  else:
    # Opening tag: keep it open until its closing tag (or end of input).
    unfinished.add(node)

func finish(unfinished: var seq[Node], result: var seq[Node]) =
  ## Closes every element that is still open.
  ##
  ## Working from the innermost element outwards, each open element is popped
  ## and appended to the children of the element below it on the stack. The
  ## outermost one is then appended to `result`. `unfinished` is empty
  ## afterwards; an already empty stack leaves `result` untouched.
  if unfinished.len == 0:
    return
  # The range is evaluated once, so this runs (initial length - 1) times.
  for _ in 1 ..< unfinished.len:
    let innermost = unfinished.pop
    unfinished[^1].children.add(innermost)
  result.add(unfinished.pop)

func flushText(buffer: var string, unfinished: var seq[Node], result: var Html) =
  ## Emits the text collected in `buffer` as a Text node and clears `buffer`.
  ## The node goes to the innermost open element, or to `result` when no
  ## element is open. Whitespace-only text is discarded.
  if buffer.strip() != "":
    let textNode = Node(kind: Text, text: buffer)
    if unfinished.len > 0:
      unfinished[^1].children.add(textNode)
    else:
      result.add(textNode)
  buffer = ""

# This implementation naively keeps track of opening/closing tags by order, not content.
func parseHTML*(html: string): Html =
  ## Parses `html` into a sequence of top-level nodes.
  ##
  ## The input is scanned character by character. Text outside of tags
  ## becomes Text nodes and the content of each `<...>` is handed to
  ## `conclude`. Elements still open at the end of the input are closed by
  ## `finish`.
  ##
  ## Text that appears while no element is open (before the first tag,
  ## between top-level elements, or after the last tag) is kept as a
  ## top-level Text node. A tag that is still unterminated at the end of the
  ## input is dropped.
  var inTag = false
  var buffer = ""
  var unfinished: seq[Node] = @[]

  for c in html:
    if not inTag and c == '<':
      # Beginning of a tag: emit the text collected so far.
      flushText(buffer, unfinished, result)
      inTag = true
    elif inTag and c == '>':
      # End of a tag.
      conclude(buffer, unfinished, result)
      inTag = false
      buffer = ""
    else:
      buffer.add c

  # Text after the last tag would otherwise be lost.
  if not inTag:
    flushText(buffer, unfinished, result)
  finish(unfinished, result)

proc renderSource*(html: string) =
  ## Writes `html` to standard output unchanged, one character at a time.
  for ch in html.items:
    stdout.write(ch)