aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author j-james 2022-06-02 22:02:46 +0000
committer j-james 2022-06-24 06:23:44 +0000
commit b629bff6211a18327d49d34eaee5a98471308030 (patch)
tree 68ac42d82fdbaaf1dac6377118108bb69c389b3a /src
Basic HTTP support and URL parsing
Diffstat (limited to 'src')
-rw-r--r-- src/browser.nim 33
-rw-r--r-- src/html.nim 30
-rw-r--r-- src/protocols/http.nim 67
-rw-r--r-- src/uri.nim 182
4 files changed, 312 insertions, 0 deletions
diff --git a/src/browser.nim b/src/browser.nim
new file mode 100644
index 0000000..97713a3
--- /dev/null
+++ b/src/browser.nim
@@ -0,0 +1,33 @@
+import std/strutils, protocols/http, html, uri
+
+# Address handed to `request` by the module-level call at the bottom of this
+# file. It is a fixed string for now; the commented-out line below would read
+# it from the first command-line argument instead.
+let url = "https://example.org:443/index.html"
+# let url = paramStr(1)
+
+proc request(uri: string) =
+  ## Parses `uri` and handles it according to its scheme.
+  ##
+  ## * `http` / `https`: fetch the document and hand its body to `renderHTML`.
+  ## * `view-source`: fetch the URL that follows the `view-source:` prefix and
+  ##   hand its body to `renderSource`.
+  ## * `file` / `data`: accepted but not implemented yet (nothing happens).
+  ##
+  ## Any other scheme raises `Exception`.
+  # This is probably the best place to implement scheme-specific stuff
+  let parsed = parseURL(uri)
+  case parsed.scheme
+  of "http", "https":
+    renderHTML(httpRequest(parsed).body)
+  # Exercise: view-source
+  of "view-source":
+    # We must parse the url again without the view-source: prefix
+    let inner = uri.split(':', maxsplit = 1)[1]
+    renderSource(httpRequest(parseURL(inner)).body)
+  # Exercise: file:// scheme
+  # Exercise: data scheme
+  of "file", "data":
+    discard
+  else:
+    raise newException(Exception, "Not a valid scheme: " & parsed.scheme)
+
+# Program entry point: handle the module-level `url` as soon as the module runs.
+request(url)
+
+# HTTP/1.1
+# Compression
+# Redirects
+# Caching
diff --git a/src/html.nim b/src/html.nim
new file mode 100644
index 0000000..4ae5ee1
--- /dev/null
+++ b/src/html.nim
@@ -0,0 +1,30 @@
+type
+  # Result type of `parseHTML`; so far it only holds a flat list of strings.
+  Html = object
+    tags: seq[string]
+
+  # One node of a tag tree: a name, some text and the tags nested inside it.
+  # Marked `acyclic` even though the type refers to itself through `nested`.
+  Tag {.acyclic.} = object
+    name: string
+    text: string
+    nested: seq[Tag]
+
+# Stub: the body is just `discard`, so `result` is never assigned and every
+# input yields a default-initialised `Html` (no tags).
+func parseHTML(html: string): Html = discard
+
+# Todo: revamp parsing: keep track of tags, entities, etc
+proc renderHTML*(html: string) =
+  ## Writes the text content of `html` to stdout.
+  ##
+  ## This is a character scanner, not a parser: everything between `<` and `>`
+  ## is skipped, and output only starts once a tag beginning with `<body` has
+  ## been seen. Of the remaining characters only printable ASCII (32..126) and
+  ## newlines are written; entities are not decoded.
+  const bodyTag = "<body"
+  var
+    inAngle = false
+    inBody = false
+
+  # `pairs` on a string yields (index, char), hence `i, c`.
+  for i, c in html:
+    if c == '<':
+      inAngle = true
+      # Look ahead for "<body", but only when that many characters are left:
+      # slicing past the end of the string raises IndexDefect, which used to
+      # happen for any '<' within the last four characters of the input.
+      if i + bodyTag.len <= html.len and
+          html[i ..< i + bodyTag.len] == bodyTag:
+        inBody = true
+    elif c == '>':
+      inAngle = false
+    elif not inAngle and inBody and c in {char(32)..char(126), '\n'}:
+      stdout.write(c)
+
+proc renderSource*(html: string) =
+  ## Writes `html` to stdout exactly as received, markup included.
+  stdout.write(html)
diff --git a/src/protocols/http.nim b/src/protocols/http.nim
new file mode 100644
index 0000000..df812fd
--- /dev/null
+++ b/src/protocols/http.nim
@@ -0,0 +1,67 @@
+import std/[strutils, net], ../uri
+
+# https://datatracker.ietf.org/doc/html/rfc1945
+
+type
+  # Fields shared by requests and responses.
+  Http* = object of RootObj
+    version*: string # always present
+    # (name, value) pairs in the order they were added
+    headers*: seq[tuple[header: string, value: string]]
+
+  # Request-only fields.
+  HttpRequest* = object of Http
+    `method`*: string # only present in requests
+    uri*: string # only present in requests
+
+  # Response-only fields.
+  HttpResponse* = object of Http
+    status*: int # status code, only in responses
+    reason*: string # status elaboration, only in responses
+    body*: string # html document, usually
+
+# This parses a HTTP response that has been split into headers and a body.
+func parseResponse*(http: seq[string], body: string): HttpResponse =
+  ## Builds an `HttpResponse` from the already-split head of a response.
+  ##
+  ## `http[0]` is the status line (`VERSION CODE REASON`, separated by single
+  ## spaces); every following entry is a `Name: value` header line. Header
+  ## names are stored lower-cased and values are stripped of surrounding
+  ## whitespace; lines without a ':' are ignored. `body` is stored untouched.
+  ##
+  ## Raises `RangeDefect` when the status line is missing or does not have
+  ## three parts, and `ValueError` (from `parseInt`) when the status code is
+  ## not an integer.
+  # let http: seq[string] = http.split("\r\n")
+  if http.len == 0:
+    # Without this guard `http[0]` below fails with an unrelated IndexDefect.
+    raise newException(RangeDefect, "First line of response is invalid: ")
+
+  let statusLine = http[0].split(' ', maxsplit = 2)
+  if statusLine.len != 3:
+    raise newException(RangeDefect,
+      "First line of response is invalid: " & http[0])
+  result.version = statusLine[0]
+  result.status = statusLine[1].parseInt
+  result.reason = statusLine[2]
+
+  # Note: the spec specifies that \r\n\r\n marks the end of a request
+  for line in http[1 ..< http.len]:
+    let field = line.split(':', maxsplit = 1)
+    if field.len == 2:
+      # Header names are ASCII, so the ASCII-only lower-casing is sufficient
+      # (`toLower` was deprecated in favour of `toLowerAscii`).
+      result.headers.add((field[0].toLowerAscii, field[1].strip()))
+  result.body = body
+
+proc httpRequest*(url: Url): HttpResponse =
+  ## Sends an HTTP/1.0 `GET` for `url` and returns the parsed response.
+  ##
+  ## Connects to `url.host` on `url.port`, or on 80 (`http`) / 443 (`https`)
+  ## when the URL names no port; for `https` the socket is wrapped in an SSL
+  ## context first. The response head is read line by line up to the blank
+  ## line, everything after it is collected as the body, and both are passed
+  ## to `parseResponse`. The socket is closed before returning, also when an
+  ## exception is raised.
+  # Request target: the path ("/" when the URL has none, since an empty target
+  # makes the request line malformed) followed by the query, if any.
+  var target = if url.path == "": "/" else: url.path
+  if url.query != "":
+    target &= "?" & url.query
+  # The Host header names the server actually being contacted.
+  let requestText =
+    "GET " & target & " HTTP/1.0\r\nHost: " & url.host & "\r\n\r\n"
+
+  let socket = newSocket(AF_INET, SOCK_STREAM, IPPROTO_TCP)
+  defer: socket.close()
+
+  var port = 80
+  if url.scheme == "https":
+    port = 443
+    let ctx: SSLContext = newContext()
+    ctx.wrapSocket(socket)
+  if url.port != 0:
+    port = url.port
+
+  socket.connect(url.host, Port(port))
+  socket.send(requestText)
+
+  var
+    response: seq[string]
+    buffer: string
+  while true:
+    socket.readLine(buffer)
+    # readLine sets the buffer to "\r\n" for the blank line that ends the
+    # head, and to "" once the peer has disconnected; without the second
+    # check a connection closed early would loop here forever.
+    if buffer == "\r\n" or buffer == "":
+      break
+    response.add(buffer)
+
+  # assert "transfer-encoding" notin parsed.headers
+  # assert "content-encoding" notin parsed.headers
+
+  # HTTP/1.0 without keep-alive: the body runs until the server closes the
+  # connection, at which point recv returns 0.
+  var body: string
+  while socket.recv(buffer, 1024) > 0:
+    body &= buffer
+
+  result = parseResponse(response, body)
diff --git a/src/uri.nim b/src/uri.nim
new file mode 100644
index 0000000..e2fc7f3
--- /dev/null
+++ b/src/uri.nim
@@ -0,0 +1,182 @@
+import std/strutils
+
+# https://datatracker.ietf.org/doc/html/rfc3986
+type
+  # The components `parseURL` splits a URI into. The trailing comments name
+  # the delimiter associated with each component.
+  Url* = object
+    scheme*: string    # ends at the first ':'
+    authority*: string # introduced by "//"; userinfo, host and port together
+    userinfo*: string  # followed by '@'
+    host*: string      # e.g. "example.org" or a bracketed address
+    port*: int         # follows the host after ':'; 0 when not given
+    path*: string      # starts with '/'
+    query*: string     # introduced by '?'
+    fragment*: string  # introduced by '#'
+
+const
+  # Delimiters between the major URI components.
+  gendelims* = {':', '/', '?', '#', '[', ']', '@'}
+  # Delimiters usable inside a component.
+  subdelims* = {
+    '!', '$', '&', '\'', '(', ')',
+    '*', '+', ',', ';', '='
+  }
+  # Union of the two delimiter sets above.
+  reserved* = gendelims + subdelims
+  # Characters `encodeURL` copies through without percent-encoding.
+  unreserved* = {'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'}
+  # plus, whitespace and control characters
+  otherchars* = {'"', '%', '<', '>', '\\', '^', '`', '{', '|', '}'}
+
+# This might be spec-compliant.
+func parseURL*(url: string): Url =
+  ## Splits an absolute URI into the fields of `Url`, following RFC 3986.
+  ##
+  ## * The scheme is stored lower-cased.
+  ## * A non-empty path is stored with exactly one leading '/', whether or not
+  ##   the input had one (so `mailto:a@b` yields the path `/a@b`).
+  ## * `port` stays 0 when the authority names no port or an empty one.
+  ##
+  ## Raises `RangeDefect` when `url` contains no ':' to terminate a scheme and
+  ## `ValueError` when the port is not a number. The `assert`s check the
+  ## character sets of scheme, port and host.
+  # Each URI must begin with a scheme name.
+  # Scheme names consist of a sequence of characters beginning with a letter,
+  # and followed by any combination of letters, digits, plus, period, or hyphen.
+  let schemeAndRest = url.split(':', maxsplit = 1)
+  if schemeAndRest.len != 2:
+    raise newException(RangeDefect, "URL does not begin with a scheme")
+
+  # Although schemes are case-insensitive, an implementation should produce
+  # lowercase scheme names for consistency.
+  result.scheme = schemeAndRest[0].toLowerAscii
+  assert allCharsInSet(result.scheme, {'a'..'z', '0'..'9', '+', '-', '.'})
+
+  # The authority component is preceded by a double slash, and is terminated by
+  # the next slash, question mark, or pound sign, or by the end of the URI.
+  # `startsWith` is safe when fewer than two characters follow the scheme
+  # (e.g. "http:"), where slicing `[0..1]` would raise IndexDefect.
+  var rest = schemeAndRest[1]
+  if rest.startsWith("//"):
+    result.authority = rest[2 .. ^1].split({'/', '?', '#'}, maxsplit = 1)[0]
+    rest = rest[result.authority.len + 2 .. ^1]
+
+  # A fragment identifier component is indicated by the presence of a pound
+  # sign character, and terminated by the end of the URI.
+  var parts = rest.split('#', maxsplit = 1)
+  if parts.len == 2:
+    result.fragment = parts[1]
+
+  # The query component is indicated by the first question mark character,
+  # and terminated by a pound sign or the end of the URI.
+  parts = parts[0].split('?', maxsplit = 1)
+  if parts.len == 2:
+    result.query = parts[1]
+
+  # Todo: path parsing is actually more complex. this is cheating.
+  if parts[0] != "":
+    result.path = "/" & parts[0].strip(leading = true, trailing = false,
+                                       chars = {'/'})
+
+  if result.authority != "":
+    # The userinfo subcomponent, if present, is followed by an at-sign
+    # that delimits it from the host. It is removed before looking for the
+    # port because userinfo may itself contain ':' ("user:pass@host").
+    var hostPort = result.authority
+    let userSplit = hostPort.split('@', maxsplit = 1)
+    if userSplit.len == 2:
+      result.userinfo = userSplit[0]
+      hostPort = userSplit[1]
+
+    # The port subcomponent, if present, is designated by an optional port
+    # number in decimal following the host and delimited from it by a single
+    # colon.
+    let portSplit = hostPort.rsplit({':'}, maxsplit = 1)
+    if portSplit.len == 2 and ']' notin portSplit[1]:
+      # Todo: parse hosts
+      result.host = portSplit[0]
+      # "host:" with nothing after the colon is allowed; the port stays 0.
+      if portSplit[1] != "":
+        result.port = portSplit[1].parseInt
+        assert allCharsInSet(portSplit[1], {'0'..'9'})
+    else:
+      # Either there is no colon at all, or the last colon sits inside a
+      # bracketed IPv6 literal and so is part of the host, not a port.
+      result.host = hostPort
+    # Todo: is this assert right?
+    assert allCharsInSet(result.host, unreserved + subdelims + {':', '[', ']'})
+
+ #[ Todo: advanced resolution and relative references
+ # A relative reference takes advantage of the hierarchical syntax to express
+ # a reference relative to the name space of another hierarchical URI.
+ if url.len == 0:
+ raise newException(RangeDefect, "URL is an empty string")
+ # Relative path reference
+ elif url[0] == '.':
+ return
+ # Network path reference
+ elif url.len > 1 and url[0..1] == "//":
+ return
+ # Absolute path reference
+ elif url[0] == '/':
+ discard url.split('/', maxsplit=1)
+ ]#
+
+#[
+# This is compact. But it's probably not spec-compliant.
+func parseURL*(url: string): Url =
+ # https://google.com/path?query#fragment
+ # https, //google.com/path?query#fragment
+ var split: seq[string] = url.split(':', maxsplit=1)
+ if split.len == 2:
+ result.scheme = split[0]
+
+ # //google.com/path?query, fragment
+ split = split[1].rsplit('#', maxsplit=1)
+ if split.len == 2:
+ result.fragment = split[1]
+
+ # //google.com/path, query
+ # //google.com/, query
+ split = split[0].rsplit('?', maxsplit=1)
+ if split.len == 2:
+ result.query = split[1]
+
+ # //google.com/path
+ # //google.com
+ # somethingelse
+ if split[0][0..1] == "//":
+ split = split[0].strip(leading=true, trailing=false, {'/'}).split('/', maxsplit=1)
+ result.authority = split[0]
+ if split.len == 2:
+ result.path = split[1]
+ else:
+ # Note: This is a weird state.
+ result.path = split[0].strip(leading=true, trailing=true, {'/'})
+ debugEcho split[0]
+
+ # userinfo@host:port
+ # userinfo, host:port
+ var remainder = result.authority
+ split = result.authority.split('@', maxsplit=1)
+ if split.len == 2:
+ result.userinfo = split[0]
+ remainder = split[1]
+
+ # host, port
+ split = remainder.rsplit(':', maxsplit=1)
+ remainder = split[0]
+ if split.len == 2:
+ result.port = split[1].parseInt
+
+ result.host = split[0]
+
+ else:
+ raise newException(OSError, "URL does not contain a colon")
+]#
+
+# Self-checks that run whenever this module is loaded. Each one parses a URL
+# and compares the whole resulting `Url` with the expected value; fields not
+# named in the expected `Url(...)` must therefore still be empty / 0.
+# The first two are disabled (commented out).
+# assert parseURL("google.com") == Url(authority: "google.com", host: "google.com")
+# assert parseURL("http://") == Url(scheme: "http")
+assert parseURL("https://google.com") == Url(scheme: "https", authority: "google.com", host: "google.com")
+assert parseURL("https://google.com/path//path") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path//path")
+assert parseURL("https://google.com/path?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", query: "query")
+assert parseURL("https://google.com/path/?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path/", query: "query")
+assert parseURL("https://google.com?query") == Url(scheme: "https", authority: "google.com", host: "google.com", query: "query")
+assert parseURL("https://google.com/path#fragment") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment")
+# A '?' after the '#' belongs to the fragment, not to a query.
+assert parseURL("https://google.com/path#fragment?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment?query")
+assert parseURL("https://192.168.0.1/path#fragment?query") == Url(scheme: "https", authority: "192.168.0.1", host: "192.168.0.1", path: "/path", fragment: "fragment?query")
+# Only the part after the last ':' of the authority is taken as the port.
+assert parseURL("https://12:12:12:12/path#fragment?query") == Url(scheme: "https", authority: "12:12:12:12", host: "12:12:12", port: 12, path: "/path", fragment: "fragment?query")
+# Bracketed hosts keep their brackets in `host`.
+assert parseURL("https://[12:213:213:fr:12]:1200/path#fragment?query") == Url(scheme: "https", authority: "[12:213:213:fr:12]:1200", host: "[12:213:213:fr:12]", port: 1200, path: "/path", fragment: "fragment?query")
+assert parseURL("https://userinfo@host.com:8080/path#fragment?query") == Url(scheme: "https", authority: "userinfo@host.com:8080", userinfo: "userinfo", host: "host.com", port: 8080, path: "/path", fragment: "fragment?query")
+
+# NOTE(review): this URL and the ones after it look like the example URIs of
+# RFC 3986, section 1.1.2; confirm against the RFC.
+assert parseURL("ftp://ftp.is.co.za/rfc/rfc1808.txt") == Url(scheme: "ftp", authority: "ftp.is.co.za", host: "ftp.is.co.za", path: "/rfc/rfc1808.txt")
+assert parseURL("http://www.ietf.org/rfc/rfc2396.txt") == Url(scheme: "http", authority: "www.ietf.org", host: "www.ietf.org", path: "/rfc/rfc2396.txt")
+assert parseURL("ldap://[2001:db8::7]/c=GB?objectClass?one") == Url(scheme: "ldap", authority: "[2001:db8::7]", host: "[2001:db8::7]", path: "/c=GB", query: "objectClass?one")
+
+# Todo: Ugh, are these right?
+# The expected paths below start with a '/' that is not in the input:
+# parseURL prepends one to every non-empty path, even without an authority.
+assert parseURL("mailto:John.Doe@example.com") == Url(scheme: "mailto", path: "/John.Doe@example.com") # ugh is this right
+assert parseURL("news:comp.infosystems.www.servers.unix") == Url(scheme: "news", path: "/comp.infosystems.www.servers.unix")
+assert parseURL("tel:+1-816-555-1212") == Url(scheme: "tel", path: "/+1-816-555-1212")
+assert parseURL("telnet://192.0.2.16:80/") == Url(scheme: "telnet", authority: "192.0.2.16:80", host: "192.0.2.16", port: 80, path: "/")
+assert parseURL("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") == Url(scheme: "urn", path: "/oasis:names:specification:docbook:dtd:xml:4.1.2")
+
+func encodeURL(url: string): string =
+  ## Percent-encodes `url` byte by byte: characters in `unreserved` are copied
+  ## as they are, every other byte becomes '%' followed by its two-digit
+  ## uppercase hexadecimal value.
+  for ch in url:
+    # Note: `+`?
+    if ch notin unreserved:
+      result.add('%')
+      result.add(toHex(ord(ch), 2))
+    else:
+      result.add(ch)