From aace82a89073d3547856742e9f3a403b1f3fba03 Mon Sep 17 00:00:00 2001 From: j-james Date: Fri, 1 Jul 2022 22:58:19 -0700 Subject: Move URI handling and rename node.nested to node.children --- src/browser.nim | 11 +-- src/formats/html.nim | 15 ++-- src/formats/uri.nim | 182 +++++++++++++++++++++++++++++++++++++++++++++++++ src/gui/terminal.nim | 4 +- src/protocols/http.nim | 2 +- src/uri.nim | 182 ------------------------------------------------- 6 files changed, 199 insertions(+), 197 deletions(-) create mode 100644 src/formats/uri.nim delete mode 100644 src/uri.nim diff --git a/src/browser.nim b/src/browser.nim index 97713a3..c8fdea8 100644 --- a/src/browser.nim +++ b/src/browser.nim @@ -1,21 +1,21 @@ -import std/strutils, protocols/http, html, uri +import std/strutils, protocols/http, formats/[html, uri], gui/terminal let url = "https://example.org:443/index.html" # let url = paramStr(1) -proc request(uri: string) = +proc request*(uri: string) = # This is probably the best place to implement scheme-specific stuff let url = parseURL(uri) case url.scheme: of "http", "https": let response = httpRequest(url) - renderHTML(response.body) + render(parseHTML(response.body)) # Exercise: view-source of "view-source": # We must parse the url again without the view-source: prefix let url = uri.split(':', maxsplit=1)[1] let response = httpRequest(parseURL(url)) - renderSource(response.body) + renderSource(parseHTML((response.body))) # Exercise: file:// scheme of "file": discard @@ -25,7 +25,8 @@ proc request(uri: string) = else: raise newException(Exception, "Not a valid scheme: " & url.scheme) -request(url) +when isMainModule: + request(url) # HTTP/1.1 # Compression diff --git a/src/formats/html.nim b/src/formats/html.nim index 8724295..8299a51 100644 --- a/src/formats/html.nim +++ b/src/formats/html.nim @@ -13,16 +13,17 @@ type NodeKind* = enum # Clever node implementation from callsamu and XmlNodeObj # Note that Text nodes are _only_ text. # ex. this test node is three nodes: "this ", " node", and the tag -type Node* {.acyclic.} = object +type Node* = ref object + # parent*: Node # Unfortunately, we will have to deal with cycles. case kind*: NodeKind: of Text: text*: string of Element: tag*: string attributes*: Table[string, string] # change - nested*: seq[Node] + children*: seq[Node] -# Note that even plain text is valid HTML. +# Note that even plain text is valid HTML, in this implementation. type Html* = seq[Node] type ParserState = enum @@ -54,7 +55,7 @@ func conclude(buffer: string, unfinished: var seq[Node], result: var Html) = let split: seq[string] = buffer.strip(false, true, {'/'}).strip().split(' ') let tag = split[0].toLower let attributes = split[1..^1].attributes - let node = Node(kind: Element, tag: tag, attributes: attributes, nested: @[]) + let node = Node(kind: Element, tag: tag, attributes: attributes, children: @[]) # If we're in a self-closing tag: if tag in self_closing_tags: @@ -64,7 +65,7 @@ func conclude(buffer: string, unfinished: var seq[Node], result: var Html) = if tag.len > 0 and tag[0] == '/' or tag in self_closing_tags: # Add the element to the parent node if unfinished.len > 1: - unfinished[^2].nested.add(unfinished.pop) + unfinished[^2].children.add(unfinished.pop) # Or, if there is no parent node, add the element to the result else: result.add(unfinished.pop) @@ -75,7 +76,7 @@ func conclude(buffer: string, unfinished: var seq[Node], result: var Html) = func finish(unfinished: var seq[Node], result: var seq[Node]) = while unfinished.len > 1: - unfinished[^2].nested.add(unfinished.pop) + unfinished[^2].children.add(unfinished.pop) if unfinished.len == 1: result.add(unfinished.pop) @@ -90,7 +91,7 @@ func parseHTML*(html: string): Html = if not in_tag and c == '<': # Add the collected text content to the parent node, if there is text if buffer.strip() != "": - unfinished[^1].nested.add(Node(kind: Text, text: buffer)) + unfinished[^1].children.add(Node(kind: Text, text: buffer)) in_tag = true buffer = "" # End of a tag diff --git a/src/formats/uri.nim b/src/formats/uri.nim new file mode 100644 index 0000000..e2fc7f3 --- /dev/null +++ b/src/formats/uri.nim @@ -0,0 +1,182 @@ +import std/strutils + +# https://datatracker.ietf.org/doc/html/rfc3986 +type Url* = object + scheme*: string # : + authority*: string # // + userinfo*: string # @ + host*: string # . + port*: int # : + path*: string # / + query*: string # ? + fragment*: string # # + +const + gendelims* = {':', '/', '?', '#', '[', ']', '@'} + subdelims* = {'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='} + reserved* = gendelims + subdelims + unreserved* = {'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'} + otherchars* = {'"', '%', '<', '>', '\\', '^', '`', '{', '|', '}', } # plus, whitespace and control characters + +# This might be spec-compliant. +func parseURL*(url: string): Url = + # Each URI must begin with a scheme name. + # Scheme names consist of a sequence of characters beginning with a letter, + # and followed by any combination of letters, digits, plus, period, or hyphen. + var split = url.split(':', maxsplit=1) + if split.len == 2: + # Although schemes are case-insensitive, an implementation should produce + # lowercase scheme names for consistency. + result.scheme = split[0].toLower + assert allCharsInSet(result.scheme, {'a'..'z', '0'..'9', '+', '-', '.'}) + + # The authority component is preceded by a double slash, and is terminated by + # the next slash, question mark, or pound sign, or by the end of the URI. + if split[1][0..1] == "//": + result.authority = split[1][2..^1].split({'/', '?', '#'}, maxsplit=1)[0] + split[1] = split[1][result.authority.len + 2 ..< split[1].len] + + # A fragment identifier component is indicate by the presense of a pound sign + # character, and terminated by the end of the URI. + split = split[1].split('#', maxsplit=1) + if split.len == 2: + result.fragment = split[1] + + # The query component is indicated by the first question mark character, + # and terminated by a pound sign or the end of the URI. + split = split[0].split('?', maxsplit=1) + if split.len == 2: + result.query = split[1] + + # Todo: path parsing is actually more complex. this is cheating. + if split[0] != "": + result.path = "/" & split[0].strip(leading=true, trailing=false, {'/'}) + + if result.authority != "": + # The port subcomponent, if present, is designated by an optional port number + # in decimal following the host and deliminated from it by a single colon. + split = result.authority.rsplit({':'}, maxsplit=1) + if split.len == 2: + # If we accidentally parsed an IPv6 address: reconstruct + if ']' in split[1]: + split[0] = split[0] & ":" & split[1] + split[1] = "" + else: + result.port = split[1].parseInt + assert allCharsInSet(split[1], {'0'..'9'}) + + # The userinfo subcomponent, if present, is followed by an at-sign + # that delimits it from the host. + split = split[0].split('@', maxsplit=1) + if split.len == 2: + result.userinfo = split[0] + # Todo: parse hosts + result.host = split[1] + else: + result.host = split[0] + # Todo: is this assert right? + assert allCharsInSet(result.host, unreserved + subdelims + {':', '[', ']'}) + else: + raise newException(RangeDefect, "URL does not begin with a scheme") + + #[ Todo: advanced resolution and relative references + # A relative reference takes advantage of the hierarchical syntax to express + # a reference relative to the name space of another hierarchical URI. + if url.len == 0: + raise newException(RangeDefect, "URL is an empty string") + # Relative path reference + elif url[0] == '.': + return + # Network path reference + elif url.len > 1 and url[0..1] == "//": + return + # Absolute path reference + elif url[0] == '/': + discard url.split('/', maxsplit=1) + ]# + +#[ +# This is compact. But it's probably not spec-compliant. +func parseURL*(url: string): Url = + # https://google.com/path?query#fragment + # https, //google.com/path?query#fragment + var split: seq[string] = url.split(':', maxsplit=1) + if split.len == 2: + result.scheme = split[0] + + # //google.com/path?query, fragment + split = split[1].rsplit('#', maxsplit=1) + if split.len == 2: + result.fragment = split[1] + + # //google.com/path, query + # //google.com/, query + split = split[0].rsplit('?', maxsplit=1) + if split.len == 2: + result.query = split[1] + + # //google.com/path + # //google.com + # somethingelse + if split[0][0..1] == "//": + split = split[0].strip(leading=true, trailing=false, {'/'}).split('/', maxsplit=1) + result.authority = split[0] + if split.len == 2: + result.path = split[1] + else: + # Note: This is a weird state. + result.path = split[0].strip(leading=true, trailing=true, {'/'}) + debugEcho split[0] + + # userinfo@host:port + # userinfo, host:port + var remainder = result.authority + split = result.authority.split('@', maxsplit=1) + if split.len == 2: + result.userinfo = split[0] + remainder = split[1] + + # host, port + split = remainder.rsplit(':', maxsplit=1) + remainder = split[0] + if split.len == 2: + result.port = split[1].parseInt + + result.host = split[0] + + else: + raise newException(OSError, "URL does not contain a colon") +]# + +# assert parseURL("google.com") == Url(authority: "google.com", host: "google.com") +# assert parseURL("http://") == Url(scheme: "http") +assert parseURL("https://google.com") == Url(scheme: "https", authority: "google.com", host: "google.com") +assert parseURL("https://google.com/path//path") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path//path") +assert parseURL("https://google.com/path?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", query: "query") +assert parseURL("https://google.com/path/?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path/", query: "query") +assert parseURL("https://google.com?query") == Url(scheme: "https", authority: "google.com", host: "google.com", query: "query") +assert parseURL("https://google.com/path#fragment") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment") +assert parseURL("https://google.com/path#fragment?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment?query") +assert parseURL("https://192.168.0.1/path#fragment?query") == Url(scheme: "https", authority: "192.168.0.1", host: "192.168.0.1", path: "/path", fragment: "fragment?query") +assert parseURL("https://12:12:12:12/path#fragment?query") == Url(scheme: "https", authority: "12:12:12:12", host: "12:12:12", port: 12, path: "/path", fragment: "fragment?query") +assert parseURL("https://[12:213:213:fr:12]:1200/path#fragment?query") == Url(scheme: "https", authority: "[12:213:213:fr:12]:1200", host: "[12:213:213:fr:12]", port: 1200, path: "/path", fragment: "fragment?query") +assert parseURL("https://userinfo@host.com:8080/path#fragment?query") == Url(scheme: "https", authority: "userinfo@host.com:8080", userinfo: "userinfo", host: "host.com", port: 8080, path: "/path", fragment: "fragment?query") + +assert parseURL("ftp://ftp.is.co.za/rfc/rfc1808.txt") == Url(scheme: "ftp", authority: "ftp.is.co.za", host: "ftp.is.co.za", path: "/rfc/rfc1808.txt") +assert parseURL("http://www.ietf.org/rfc/rfc2396.txt") == Url(scheme: "http", authority: "www.ietf.org", host: "www.ietf.org", path: "/rfc/rfc2396.txt") +assert parseURL("ldap://[2001:db8::7]/c=GB?objectClass?one") == Url(scheme: "ldap", authority: "[2001:db8::7]", host: "[2001:db8::7]", path: "/c=GB", query: "objectClass?one") + +# Todo: Ugh, are these right? +assert parseURL("mailto:John.Doe@example.com") == Url(scheme: "mailto", path: "/John.Doe@example.com") # ugh is this right +assert parseURL("news:comp.infosystems.www.servers.unix") == Url(scheme: "news", path: "/comp.infosystems.www.servers.unix") +assert parseURL("tel:+1-816-555-1212") == Url(scheme: "tel", path: "/+1-816-555-1212") +assert parseURL("telnet://192.0.2.16:80/") == Url(scheme: "telnet", authority: "192.0.2.16:80", host: "192.0.2.16", port: 80, path: "/") +assert parseURL("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") == Url(scheme: "urn", path: "/oasis:names:specification:docbook:dtd:xml:4.1.2") + +func encodeURL(url: string): string = + for c in url: + if c in unreserved: + result &= c + # Note: `+`? + else: + result &= "%" & toHex(ord(c), 2) diff --git a/src/gui/terminal.nim b/src/gui/terminal.nim index e893081..f4c7c92 100644 --- a/src/gui/terminal.nim +++ b/src/gui/terminal.nim @@ -9,7 +9,7 @@ proc print(node: Node, indent=0, raw=false) = stdout.write(" " & attribute[0] & "=" & attribute[1]) stdout.write(">") stdout.write('\n') - for i in node.nested: + for i in node.children: i.print(indent+2) if not raw: stdout.write(" ".repeat(indent)) @@ -28,7 +28,7 @@ proc renderSource*(html: Html) = node.print(0, true) when isMainModule: - import ../protocols/http, ../uri + import ../protocols/http, ../formats/uri let url = "https://example.org:443/index.html" let request = httpRequest(parseUrl(url)) let parsed = parseHTML(request.body) diff --git a/src/protocols/http.nim b/src/protocols/http.nim index df812fd..2de11df 100644 --- a/src/protocols/http.nim +++ b/src/protocols/http.nim @@ -1,4 +1,4 @@ -import std/[strutils, net], ../uri +import std/[strutils, net], ../formats/uri # https://datatracker.ietf.org/doc/html/rfc1945 diff --git a/src/uri.nim b/src/uri.nim deleted file mode 100644 index e2fc7f3..0000000 --- a/src/uri.nim +++ /dev/null @@ -1,182 +0,0 @@ -import std/strutils - -# https://datatracker.ietf.org/doc/html/rfc3986 -type Url* = object - scheme*: string # : - authority*: string # // - userinfo*: string # @ - host*: string # . - port*: int # : - path*: string # / - query*: string # ? - fragment*: string # # - -const - gendelims* = {':', '/', '?', '#', '[', ']', '@'} - subdelims* = {'!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '='} - reserved* = gendelims + subdelims - unreserved* = {'a'..'z', 'A'..'Z', '0'..'9', '-', '.', '_', '~'} - otherchars* = {'"', '%', '<', '>', '\\', '^', '`', '{', '|', '}', } # plus, whitespace and control characters - -# This might be spec-compliant. -func parseURL*(url: string): Url = - # Each URI must begin with a scheme name. - # Scheme names consist of a sequence of characters beginning with a letter, - # and followed by any combination of letters, digits, plus, period, or hyphen. - var split = url.split(':', maxsplit=1) - if split.len == 2: - # Although schemes are case-insensitive, an implementation should produce - # lowercase scheme names for consistency. - result.scheme = split[0].toLower - assert allCharsInSet(result.scheme, {'a'..'z', '0'..'9', '+', '-', '.'}) - - # The authority component is preceded by a double slash, and is terminated by - # the next slash, question mark, or pound sign, or by the end of the URI. - if split[1][0..1] == "//": - result.authority = split[1][2..^1].split({'/', '?', '#'}, maxsplit=1)[0] - split[1] = split[1][result.authority.len + 2 ..< split[1].len] - - # A fragment identifier component is indicate by the presense of a pound sign - # character, and terminated by the end of the URI. - split = split[1].split('#', maxsplit=1) - if split.len == 2: - result.fragment = split[1] - - # The query component is indicated by the first question mark character, - # and terminated by a pound sign or the end of the URI. - split = split[0].split('?', maxsplit=1) - if split.len == 2: - result.query = split[1] - - # Todo: path parsing is actually more complex. this is cheating. - if split[0] != "": - result.path = "/" & split[0].strip(leading=true, trailing=false, {'/'}) - - if result.authority != "": - # The port subcomponent, if present, is designated by an optional port number - # in decimal following the host and deliminated from it by a single colon. - split = result.authority.rsplit({':'}, maxsplit=1) - if split.len == 2: - # If we accidentally parsed an IPv6 address: reconstruct - if ']' in split[1]: - split[0] = split[0] & ":" & split[1] - split[1] = "" - else: - result.port = split[1].parseInt - assert allCharsInSet(split[1], {'0'..'9'}) - - # The userinfo subcomponent, if present, is followed by an at-sign - # that delimits it from the host. - split = split[0].split('@', maxsplit=1) - if split.len == 2: - result.userinfo = split[0] - # Todo: parse hosts - result.host = split[1] - else: - result.host = split[0] - # Todo: is this assert right? - assert allCharsInSet(result.host, unreserved + subdelims + {':', '[', ']'}) - else: - raise newException(RangeDefect, "URL does not begin with a scheme") - - #[ Todo: advanced resolution and relative references - # A relative reference takes advantage of the hierarchical syntax to express - # a reference relative to the name space of another hierarchical URI. - if url.len == 0: - raise newException(RangeDefect, "URL is an empty string") - # Relative path reference - elif url[0] == '.': - return - # Network path reference - elif url.len > 1 and url[0..1] == "//": - return - # Absolute path reference - elif url[0] == '/': - discard url.split('/', maxsplit=1) - ]# - -#[ -# This is compact. But it's probably not spec-compliant. -func parseURL*(url: string): Url = - # https://google.com/path?query#fragment - # https, //google.com/path?query#fragment - var split: seq[string] = url.split(':', maxsplit=1) - if split.len == 2: - result.scheme = split[0] - - # //google.com/path?query, fragment - split = split[1].rsplit('#', maxsplit=1) - if split.len == 2: - result.fragment = split[1] - - # //google.com/path, query - # //google.com/, query - split = split[0].rsplit('?', maxsplit=1) - if split.len == 2: - result.query = split[1] - - # //google.com/path - # //google.com - # somethingelse - if split[0][0..1] == "//": - split = split[0].strip(leading=true, trailing=false, {'/'}).split('/', maxsplit=1) - result.authority = split[0] - if split.len == 2: - result.path = split[1] - else: - # Note: This is a weird state. - result.path = split[0].strip(leading=true, trailing=true, {'/'}) - debugEcho split[0] - - # userinfo@host:port - # userinfo, host:port - var remainder = result.authority - split = result.authority.split('@', maxsplit=1) - if split.len == 2: - result.userinfo = split[0] - remainder = split[1] - - # host, port - split = remainder.rsplit(':', maxsplit=1) - remainder = split[0] - if split.len == 2: - result.port = split[1].parseInt - - result.host = split[0] - - else: - raise newException(OSError, "URL does not contain a colon") -]# - -# assert parseURL("google.com") == Url(authority: "google.com", host: "google.com") -# assert parseURL("http://") == Url(scheme: "http") -assert parseURL("https://google.com") == Url(scheme: "https", authority: "google.com", host: "google.com") -assert parseURL("https://google.com/path//path") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path//path") -assert parseURL("https://google.com/path?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", query: "query") -assert parseURL("https://google.com/path/?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path/", query: "query") -assert parseURL("https://google.com?query") == Url(scheme: "https", authority: "google.com", host: "google.com", query: "query") -assert parseURL("https://google.com/path#fragment") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment") -assert parseURL("https://google.com/path#fragment?query") == Url(scheme: "https", authority: "google.com", host: "google.com", path: "/path", fragment: "fragment?query") -assert parseURL("https://192.168.0.1/path#fragment?query") == Url(scheme: "https", authority: "192.168.0.1", host: "192.168.0.1", path: "/path", fragment: "fragment?query") -assert parseURL("https://12:12:12:12/path#fragment?query") == Url(scheme: "https", authority: "12:12:12:12", host: "12:12:12", port: 12, path: "/path", fragment: "fragment?query") -assert parseURL("https://[12:213:213:fr:12]:1200/path#fragment?query") == Url(scheme: "https", authority: "[12:213:213:fr:12]:1200", host: "[12:213:213:fr:12]", port: 1200, path: "/path", fragment: "fragment?query") -assert parseURL("https://userinfo@host.com:8080/path#fragment?query") == Url(scheme: "https", authority: "userinfo@host.com:8080", userinfo: "userinfo", host: "host.com", port: 8080, path: "/path", fragment: "fragment?query") - -assert parseURL("ftp://ftp.is.co.za/rfc/rfc1808.txt") == Url(scheme: "ftp", authority: "ftp.is.co.za", host: "ftp.is.co.za", path: "/rfc/rfc1808.txt") -assert parseURL("http://www.ietf.org/rfc/rfc2396.txt") == Url(scheme: "http", authority: "www.ietf.org", host: "www.ietf.org", path: "/rfc/rfc2396.txt") -assert parseURL("ldap://[2001:db8::7]/c=GB?objectClass?one") == Url(scheme: "ldap", authority: "[2001:db8::7]", host: "[2001:db8::7]", path: "/c=GB", query: "objectClass?one") - -# Todo: Ugh, are these right? -assert parseURL("mailto:John.Doe@example.com") == Url(scheme: "mailto", path: "/John.Doe@example.com") # ugh is this right -assert parseURL("news:comp.infosystems.www.servers.unix") == Url(scheme: "news", path: "/comp.infosystems.www.servers.unix") -assert parseURL("tel:+1-816-555-1212") == Url(scheme: "tel", path: "/+1-816-555-1212") -assert parseURL("telnet://192.0.2.16:80/") == Url(scheme: "telnet", authority: "192.0.2.16:80", host: "192.0.2.16", port: 80, path: "/") -assert parseURL("urn:oasis:names:specification:docbook:dtd:xml:4.1.2") == Url(scheme: "urn", path: "/oasis:names:specification:docbook:dtd:xml:4.1.2") - -func encodeURL(url: string): string = - for c in url: - if c in unreserved: - result &= c - # Note: `+`? - else: - result &= "%" & toHex(ord(c), 2) -- cgit v1.2.3-70-g09d2