# src/formats/uri.nim

import std/strutils

# https://datatracker.ietf.org/doc/html/rfc3986
type
  Url* = object
    ## The components of a URI, as separated by `parseURL`.
    ## Each field notes the delimiter that marks it in the URI text.
    scheme*: string     ## Text before the first ':'.
    authority*: string  ## Text introduced by "//"; holds userinfo, host, port.
    userinfo*: string   ## Part of the authority before '@'.
    host*: string       ## Host part of the authority, stored verbatim.
    port*: int          ## Number after the host's ':'; 0 when none was given.
    path*: string       ## Hierarchical part, introduced by '/'.
    query*: string      ## Text after '?', up to '#' or the end.
    fragment*: string   ## Text after '#', up to the end.

const
  # RFC 3986 section 2.2: characters that delimit the generic URI components.
  gendelims*: set[char] = {'#', '/', ':', '?', '@', '[', ']'}
  # RFC 3986 section 2.2: delimiters available to individual sub-components.
  subdelims*: set[char] =
    {'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='}
  # Every reserved character: the union of the two delimiter sets.
  reserved*: set[char] = gendelims + subdelims
  # RFC 3986 section 2.3: characters that never need percent-encoding.
  unreserved*: set[char] = Letters + Digits + {'-', '.', '_', '~'}
  # Printable ASCII that is neither reserved nor unreserved. Whitespace and
  # control characters also fall in this category but are not listed here.
  otherchars*: set[char] = {'"', '%', '<', '>', '\\', '^', '`', '{', '|', '}'}

# This might be spec-compliant.
func parseURL*(url: string): Url =
  ## Splits `url` into its RFC 3986 components.
  ##
  ## The scheme is lowercased; every other component is stored verbatim,
  ## without percent-decoding. A non-empty path is normalised to exactly one
  ## leading '/', so "mailto:a@b" yields the path "/a@b". `port` stays 0 when
  ## the authority carries no port or an empty one.
  ##
  ## Raises `RangeDefect` when `url` contains no ':' and therefore no scheme,
  ## `AssertionDefect` when the scheme, port or host contains characters the
  ## grammar does not allow, and `ValueError` when the port does not fit in
  ## an `int`.

  # Each URI must begin with a scheme name, terminated by the first colon.
  let schemeSplit = url.split(':', maxsplit = 1)
  if schemeSplit.len != 2:
    raise newException(RangeDefect, "URL does not begin with a scheme")

  # Although schemes are case-insensitive, an implementation should produce
  # lowercase scheme names for consistency.
  result.scheme = schemeSplit[0].toLowerAscii
  # Scheme names consist of a sequence of characters beginning with a letter,
  # and followed by any combination of letters, digits, plus, period, or
  # hyphen. `doAssert` keeps this validation active in release builds.
  doAssert result.scheme.len > 0 and result.scheme[0] in {'a'..'z'}
  doAssert allCharsInSet(result.scheme, {'a'..'z', '0'..'9', '+', '-', '.'})

  var rest = schemeSplit[1]

  # The authority component is preceded by a double slash, and is terminated
  # by the next slash, question mark, or pound sign, or by the end of the URI.
  # `startsWith` is used instead of slicing so that a remainder shorter than
  # two characters (e.g. "a:" or "a:b") does not raise an index defect.
  if rest.startsWith("//"):
    result.authority = rest.substr(2).split({'/', '?', '#'}, maxsplit = 1)[0]
    rest = rest.substr(result.authority.len + 2)

  # A fragment identifier component is indicated by the presence of a pound
  # sign character, and terminated by the end of the URI.
  let fragmentSplit = rest.split('#', maxsplit = 1)
  if fragmentSplit.len == 2:
    result.fragment = fragmentSplit[1]

  # The query component is indicated by the first question mark character,
  # and terminated by a pound sign or the end of the URI.
  let querySplit = fragmentSplit[0].split('?', maxsplit = 1)
  if querySplit.len == 2:
    result.query = querySplit[1]

  # Todo: path parsing is actually more complex. This is cheating: leading
  # slashes are collapsed into one, and a slash is prepended to rootless
  # paths such as the one in "mailto:user@example.com".
  if querySplit[0] != "":
    result.path = "/" & querySplit[0].strip(
      leading = true, trailing = false, chars = {'/'})

  if result.authority != "":
    var hostport = result.authority

    # The userinfo subcomponent, if present, is followed by an at-sign that
    # delimits it from the host. It is removed first because userinfo may
    # itself contain colons ("user:pass@host"), which must not be mistaken
    # for the port delimiter.
    let atPos = hostport.find('@')
    if atPos >= 0:
      result.userinfo = hostport[0 ..< atPos]
      hostport = hostport.substr(atPos + 1)

    # The port subcomponent, if present, is designated by an optional port
    # number in decimal following the host and delimited from it by a single
    # colon.
    let colonPos = hostport.rfind(':')
    if colonPos >= 0:
      let portText = hostport.substr(colonPos + 1)
      # A ']' after the last colon means that colon lies inside a bracketed
      # IPv6 literal, so there is no port to split off.
      if ']' notin portText:
        # Validate before parsing: parseInt alone would also accept a sign
        # or digit-group underscores.
        doAssert allCharsInSet(portText, {'0'..'9'})
        # RFC 3986 permits an empty port ("host:"); leave `port` at 0 then.
        if portText.len > 0:
          result.port = portText.parseInt
        hostport.setLen(colonPos)

    # Todo: parse hosts
    result.host = hostport
    # Todo: is this assert right?
    doAssert allCharsInSet(result.host,
                           unreserved + subdelims + {':', '[', ']'})

  #[ Todo: advanced resolution and relative references
  # A relative reference takes advantage of the hierarchical syntax to express
  # a reference relative to the name space of another hierarchical URI.
  if url.len == 0:
    raise newException(RangeDefect, "URL is an empty string")
  # Relative path reference
  elif url[0] == '.':
    return
  # Network path reference
  elif url.len > 1 and url[0..1] == "//":
    return
  # Absolute path reference
  elif url[0] == '/':
    discard url.split('/', maxsplit=1)
  ]#

#[
# This is compact. But it's probably not spec-compliant.
func parseURL*(url: string): Url =
  # https://google.com/path?query#fragment
  # https, //google.com/path?query#fragment
  var split: seq[string] = url.split(':', maxsplit=1)
  if split.len == 2:
    result.scheme = split[0]

    # //google.com/path?query, fragment
    split = split[1].rsplit('#', maxsplit=1)
    if split.len == 2:
      result.fragment = split[1]

    # //google.com/path, query
    # //google.com/, query
    split = split[0].rsplit('?', maxsplit=1)
    if split.len == 2:
      result.query = split[1]

    # //google.com/path
    # //google.com
    # somethingelse
    if split[0][0..1] == "//":
      split = split[0].strip(leading=true, trailing=false, {'/'}).split('/', maxsplit=1)
      result.authority = split[0]
      if split.len == 2:
        result.path = split[1]
    else:
      # Note: This is a weird state.
      result.path = split[0].strip(leading=true, trailing=true, {'/'})
      debugEcho split[0]

    # userinfo@host:port
    # userinfo, host:port
    var remainder = result.authority
    split = result.authority.split('@', maxsplit=1)
    if split.len == 2:
      result.userinfo = split[0]
      remainder = split[1]

    # host, port
    split = remainder.rsplit(':', maxsplit=1)
    remainder = split[0]
    if split.len == 2:
      result.port = split[1].parseInt

    result.host = split[0]

  else:
    raise newException(OSError, "URL does not contain a colon")
]#

# Self-checks for parseURL. These run at module level whenever assertions
# are enabled.

# Disabled cases, kept for reference. The first input contains no ':', so
# parseURL raises for it instead of returning a Url.
# assert parseURL("google.com") == Url(authority: "google.com", host: "google.com")
# assert parseURL("http://") == Url(scheme: "http")

# Authority, path, query and fragment splitting. A '?' that appears after
# '#' belongs to the fragment, not to the query.
assert parseURL("https://google.com") == Url(scheme: "https", authority: "google.com", host: "google.com")
assert parseURL("https://google.com/path//path") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path//path")
assert parseURL("https://google.com/path?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", query: "query")
assert parseURL("https://google.com/path/?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path/", query: "query")
assert parseURL("https://google.com?query") == Url(scheme: "https", authority: "google.com", host: "google.com", query: "query")
assert parseURL("https://google.com/path#fragment") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment")
assert parseURL("https://google.com/path#fragment?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment?query")
# Host and port handling. Without brackets, the text after the last ':' in
# the authority is taken as the port; a bracketed host keeps its colons.
assert parseURL("https://192.168.0.1/path#fragment?query") == Url(scheme: "https", authority: "192.168.0.1", host: "192.168.0.1", path: "/path", fragment: "fragment?query")
assert parseURL("https://12:12:12:12/path#fragment?query") == Url(scheme: "https", authority: "12:12:12:12", host: "12:12:12", port: 12, path: "/path", fragment: "fragment?query")
assert parseURL("https://[12:213:213:fr:12]:1200/path#fragment?query") == Url(scheme: "https", authority: "[12:213:213:fr:12]:1200", host: "[12:213:213:fr:12]", port: 1200, path: "/path", fragment: "fragment?query")
assert parseURL("https://userinfo@host.com:8080/path#fragment?query") == Url(scheme: "https", authority: "userinfo@host.com:8080", userinfo: "userinfo", host: "host.com", port: 8080, path: "/path", fragment: "fragment?query")

# Example URIs from RFC 3986, section 1.1.2.
assert parseURL("ftp://ftp.is.co.za/rfc/rfc1808.txt") == Url(scheme: "ftp", authority: "ftp.is.co.za", host: "ftp.is.co.za", path: "/rfc/rfc1808.txt")
assert parseURL("http://www.ietf.org/rfc/rfc2396.txt") == Url(scheme: "http", authority: "www.ietf.org", host: "www.ietf.org", path: "/rfc/rfc2396.txt")
assert parseURL("ldap://[2001:db8::7]/c=GB?objectClass?one") == Url(scheme: "ldap", authority: "[2001:db8::7]", host: "[2001:db8::7]", path: "/c=GB", query: "objectClass?one")

# Todo: confirm the expected values below. For URIs without an authority,
# parseURL currently prepends '/' to the path, which these asserts pin down.
assert parseURL("mailto:John.Doe@example.com") == Url(scheme: "mailto", path: "/John.Doe@example.com") # Todo: is the leading '/' right for a rootless path?
assert parseURL("news:comp.infosystems.www.servers.unix") == Url(scheme: "news", path: "/comp.infosystems.www.servers.unix")
assert parseURL("tel:+1-816-555-1212") == Url(scheme: "tel", path: "/+1-816-555-1212")
assert parseURL("telnet://192.0.2.16:80/") == Url(scheme: "telnet", authority: "192.0.2.16:80", host: "192.0.2.16", port: 80, path: "/")
assert parseURL("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") == Url(scheme: "urn", path: "/oasis:names:specification:docbook:dtd:xml:4.1.2")

func encodeURL(url: string): string =
  ## Percent-encodes `url`: characters in `unreserved` are copied through
  ## unchanged, and every other character becomes '%' followed by its
  ## two-digit uppercase hexadecimal code.
  for ch in url:
    # Note: '+' is not in `unreserved`, so it is emitted as "%2B" — revisit
    # if that is not the intended treatment.
    if ch notin unreserved:
      result.add "%"
      result.add toHex(ord(ch), 2)
    else:
      result.add ch