From a2e04ff18ad27be4dc1c66079941baaec79e003f Mon Sep 17 00:00:00 2001 From: JJ Date: Wed, 4 Jan 2023 15:57:41 -0800 Subject: Copy the last version of the parse_wiki_text crate in for development --- Cargo.toml | 7 +- parse_wiki_text/Cargo.toml | 9 + parse_wiki_text/LICENSE | 5 + parse_wiki_text/examples/test/main.rs | 51 + parse_wiki_text/examples/test/test.rs | 70 + parse_wiki_text/examples/test/test_cases.rs | 726 ++++++++ parse_wiki_text/readme.md | 107 ++ parse_wiki_text/src/bold_italic.rs | 33 + parse_wiki_text/src/case_folding_simple.rs | 2632 +++++++++++++++++++++++++++ parse_wiki_text/src/character_entity.rs | 22 + parse_wiki_text/src/comment.rs | 109 ++ parse_wiki_text/src/configuration.rs | 164 ++ parse_wiki_text/src/default.rs | 88 + parse_wiki_text/src/external_link.rs | 47 + parse_wiki_text/src/heading.rs | 88 + parse_wiki_text/src/html_entities.rs | 259 +++ parse_wiki_text/src/lib.rs | 604 ++++++ parse_wiki_text/src/line.rs | 248 +++ parse_wiki_text/src/link.rs | 196 ++ parse_wiki_text/src/list.rs | 221 +++ parse_wiki_text/src/magic_word.rs | 26 + parse_wiki_text/src/parse.rs | 208 +++ parse_wiki_text/src/positioned.rs | 86 + parse_wiki_text/src/redirect.rs | 86 + parse_wiki_text/src/state.rs | 174 ++ parse_wiki_text/src/table.rs | 631 +++++++ parse_wiki_text/src/tag.rs | 318 ++++ parse_wiki_text/src/template.rs | 248 +++ parse_wiki_text/src/trie.rs | 167 ++ parse_wiki_text/src/warning.rs | 110 ++ 30 files changed, 7738 insertions(+), 2 deletions(-) create mode 100644 parse_wiki_text/Cargo.toml create mode 100644 parse_wiki_text/LICENSE create mode 100644 parse_wiki_text/examples/test/main.rs create mode 100644 parse_wiki_text/examples/test/test.rs create mode 100644 parse_wiki_text/examples/test/test_cases.rs create mode 100644 parse_wiki_text/readme.md create mode 100644 parse_wiki_text/src/bold_italic.rs create mode 100644 parse_wiki_text/src/case_folding_simple.rs create mode 100644 parse_wiki_text/src/character_entity.rs create mode 100644 
parse_wiki_text/src/comment.rs create mode 100644 parse_wiki_text/src/configuration.rs create mode 100644 parse_wiki_text/src/default.rs create mode 100644 parse_wiki_text/src/external_link.rs create mode 100644 parse_wiki_text/src/heading.rs create mode 100644 parse_wiki_text/src/html_entities.rs create mode 100644 parse_wiki_text/src/lib.rs create mode 100644 parse_wiki_text/src/line.rs create mode 100644 parse_wiki_text/src/link.rs create mode 100644 parse_wiki_text/src/list.rs create mode 100644 parse_wiki_text/src/magic_word.rs create mode 100644 parse_wiki_text/src/parse.rs create mode 100644 parse_wiki_text/src/positioned.rs create mode 100644 parse_wiki_text/src/redirect.rs create mode 100644 parse_wiki_text/src/state.rs create mode 100644 parse_wiki_text/src/table.rs create mode 100644 parse_wiki_text/src/tag.rs create mode 100644 parse_wiki_text/src/template.rs create mode 100644 parse_wiki_text/src/trie.rs create mode 100644 parse_wiki_text/src/warning.rs diff --git a/Cargo.toml b/Cargo.toml index 2fe11fd..f8924de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,5 +7,8 @@ edition = "2021" [dependencies] bzip2 = "0.4.3" -parse_wiki_text = "0.1.5" -# peg = "0.8.1" + +[dependencies.parse_wiki_text] +version = "0.1.5" +path = "parse_wiki_text" + diff --git a/parse_wiki_text/Cargo.toml b/parse_wiki_text/Cargo.toml new file mode 100644 index 0000000..d40739b --- /dev/null +++ b/parse_wiki_text/Cargo.toml @@ -0,0 +1,9 @@ +[package] +authors = ["Fredrik Portström "] +description = "Parse wiki text from Mediawiki into a tree of elements" +edition = "2018" +license-file = "LICENSE" +name = "parse_wiki_text" +readme = "readme.md" +repository = "https://github.com/portstrom/parse_wiki_text" +version = "0.1.5" diff --git a/parse_wiki_text/LICENSE b/parse_wiki_text/LICENSE new file mode 100644 index 0000000..e445eee --- /dev/null +++ b/parse_wiki_text/LICENSE @@ -0,0 +1,5 @@ +Copyright 2019 Fredrik Portström + +Permission is hereby granted, free of charge, to any person 
obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/parse_wiki_text/examples/test/main.rs b/parse_wiki_text/examples/test/main.rs new file mode 100644 index 0000000..72872fc --- /dev/null +++ b/parse_wiki_text/examples/test/main.rs @@ -0,0 +1,51 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+
+extern crate parse_wiki_text;
+
+mod test;
+mod test_cases;
+/// Command-line entry point: no arguments runs the test report; `file PATH` or `text WIKI_TEXT` parses the input and pretty-prints the result.
+fn main() {
+    let mut args = std::env::args();
+    match args.nth(1) { // element 0 is conventionally the program name, so this is the first user argument
+        None => return test::run_test(&Default::default()), // no arguments: run the test-case report with a default configuration
+        Some(command) => match &command as _ {
+            "file" => { // `file PATH`: parse the contents of the file at PATH
+                if let Some(path) = args.next() {
+                    if args.next().is_none() { // any extra argument falls through to the error exit at the bottom
+                        match std::fs::read_to_string(path) {
+                            Err(error) => {
+                                eprintln!("Failed to read file: {}", error);
+                                std::process::exit(1);
+                            }
+                            Ok(file_contents) => {
+                                println!(
+                                    "{:#?}",
+                                    parse_wiki_text::Configuration::default().parse(&file_contents)
+                                );
+                                return;
+                            }
+                        }
+                    }
+                }
+            }
+            "text" => { // `text WIKI_TEXT`: parse the argument itself
+                if let Some(wiki_text) = args.next() {
+                    if args.next().is_none() { // same arity check as the file command
+                        println!(
+                            "{:#?}",
+                            parse_wiki_text::Configuration::default()
+                                .parse(&wiki_text.replace("\\t", "\t").replace("\\n", "\n")) // turn typed backslash-t / backslash-n sequences into real tab / newline characters
+                        );
+                        return;
+                    }
+                }
+            }
+            _ => {}
+        },
+    }
+    eprintln!("invalid use"); // reached for unknown commands and for wrong argument counts
+    std::process::exit(1);
+}
diff --git a/parse_wiki_text/examples/test/test.rs b/parse_wiki_text/examples/test/test.rs new file mode 100644 index 0000000..9f04942 --- /dev/null +++ b/parse_wiki_text/examples/test/test.rs @@ -0,0 +1,70 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +use crate::test_cases::TEST_CASES; + +pub fn run_test(configuration: &parse_wiki_text::Configuration) { + let mut output = concat!( + "Parse Wiki Text test cases", + "", + "
" + ).to_owned(); + if let Some(window) = TEST_CASES + .windows(2) + .find(|window| window[0].0 >= window[1].0) + { + panic!("Sort: {:#?}", (window[0].0, window[1].0)); + } + for (title, test_cases) in TEST_CASES { + if let Some(window) = test_cases.windows(2).find(|window| window[0] >= window[1]) { + panic!("Sort: {:#?}", window); + } + output += &format!("", title.replace(" ", "_")); + output += title; + output += &format!(" {}", test_cases.len()); + } + output += "
"; + for (title, test_cases) in TEST_CASES { + output += &format!("

", title.replace(" ", "_")); + output += title; + output += "

"; + for wiki_text in *test_cases { + output += "
";
+            output += &wiki_text
+                .replace("&", "&")
+                .replace("<", "<")
+                .replace("\t", "")
+                .replace("\n", "\n")
+                .replace(" ", "·")
+                .replace("", "");
+            match std::panic::catch_unwind(|| configuration.parse(wiki_text)) {
+                Err(_) => {
+                    eprintln!("Panic with wiki text {:?}", wiki_text);
+                    output += "

panic
"; + } + Ok(result) => { + output += "
";
+                    output += &format!("{:#?}", result)
+                        .replace("&", "&")
+                        .replace("<", "<");
+                    output += "
"; + } + } + } + } + output += ""; + if let Err(error) = std::fs::write("report.html", output) { + eprintln!("Failed to write report: {}", error); + std::process::exit(1); + } +} diff --git a/parse_wiki_text/examples/test/test_cases.rs b/parse_wiki_text/examples/test/test_cases.rs new file mode 100644 index 0000000..da7b374 --- /dev/null +++ b/parse_wiki_text/examples/test/test_cases.rs @@ -0,0 +1,726 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub const TEST_CASES: &[(&str, &[&str])] = &[ + ( + "basic", + &[ + "", + "\t", + "\t\n", + "\t alpha", + "\talpha", + "\n", + "\n\t", + "\n\n\nalpha", + "\n\nalpha", + "\n\nalpha\n\n", + "\n \nalpha", + "\nalpha", + "\nalpha\n", + " ", + " \n\nalpha", + " \nalpha", + " ", + "!!", + "alpha", + "alpha\t", + "alpha\n", + "alpha\n\t", + "alpha\n\n", + "alpha\n\n\n", + "alpha\n\n ", + "alpha\n ", + "alpha\n \n", + "alpha\nbeta", + "alpha ", + "alpha \n", + ], + ), + ( + "bold italic", + &[ + "'", + "''", + "'''", + "''''", + "'''''", + "''''''", + "'''''''", + "''''''''", + "'''alpha", + "'''alpha''", + "'''alpha'''", + "''alpha", + "''alpha''", + "''alpha'''", + "alpha''", + "alpha'''", + "alpha'''beta", + "alpha'''beta'''gamma", + "alpha'''beta''gamma", + "alpha''beta", + "alpha''beta'''gamma", + "alpha''beta''gamma", + ], + ), + ( + "character entity", + &[ + "≪", + "Ö", + "<", + "<ö", + "<", + "< alpha", + "<ö", + "<alpha", + "ö", + "alpha <", + "alpha < beta", + "alpha<", + "alpha<beta", + ], + ), + ( + "comment", + &[ + "", + "beta", + "beta", + "-->beta", + "", + " beta", + "", + "beta", + "beta", + "alpha ", + "{{", + "{{alpha}}", + "}}", + ], + ), + ( + "paragraph break", + &[ + "alpha\t\n\nbeta", + "alpha\n\t\nbeta", + "alpha\n\n\t beta", + "alpha\n\n\tbeta", + "alpha\n\n\n\nbeta", + "alpha\n\n\nbeta", + "alpha\n\nbeta", + "alpha\n \nbeta", + "alpha \n\nbeta", + ], + ), + ( + 
"parameter", + &[ + "*alpha}}}", + "[[alpha|beta}}}]]", + "{{{", + "{{{\talpha}}}", + "{{{\nalpha}}}", + "{{{''}}}", + "{{{[[alpha|beta}}}", + "{{{alpha\t|beta}}}", + "{{{alpha\t}}}", + "{{{alpha\n|beta}}}", + "{{{alpha\n}}}", + "{{{alpha |beta}}}", + "{{{alpha }}}", + "{{{alpha|", + "{{{alpha|\tbeta}}}", + "{{{alpha|\t|}}}", + "{{{alpha|\t}}}", + "{{{alpha|\nbeta}}}", + "{{{alpha|\n|}}}", + "{{{alpha|\n}}}", + "{{{alpha| beta|}}}", + "{{{alpha| |}}}", + "{{{alpha| }}}", + "{{{alpha|beta\t|}}}", + "{{{alpha|beta\n|}}}", + "{{{alpha|beta |}}}", + "{{{alpha|beta|", + "{{{alpha|beta|\n}}}", + "{{{alpha|beta|gamma}}}", + "{{{alpha|beta|}}}", + "{{{alpha|beta}}}", + "{{{alpha|}}}", + "{{{alpha}}}", + "{{{|''}}}", + "{{{||}}}", + "{{{|}}}", + "{{{}}}", + "}}}", + ], + ), + ( + "preformatted block", + &[ + " alpha", + " alpha", + " alpha\n\n\nbeta", + " alpha\n\nbeta", + " alpha\n beta", + " alpha\n beta\n gamma", + " alpha\n beta\ngamma", + " alpha\nbeta", + " alpha\nbeta\n gamma", + "alpha\t\n beta", + "alpha\n\n beta", + "alpha\n \n beta", + "alpha\n =beta=\ngamma", + "alpha\n beta", + "alpha\n beta\n gamma", + "alpha\n beta\ngamma", + "alpha \n beta", + ], + ), + ( + "redirect", + &[ + "\t#REDIRECT[[alpha]]", + "\n\n#REDIRECT[[alpha]]", + "\n #REDIRECT[[alpha]]", + "\n#REDIRECT [[alpha]]", + " \n#REDIRECT[[alpha]]", + " #REDIRECT[[alpha]]", + " #REDIRECT[[alpha]]", + "#REDIRECT\t:[[alpha]]", + "#REDIRECT\t[[alpha]]", + "#REDIRECT\n\n[[alpha]]", + "#REDIRECT\n [[alpha]]", + "#REDIRECT\n:\n[[alpha]]", + "#REDIRECT\n:[[alpha]]", + "#REDIRECT\n[[alpha]]", + "#REDIRECT \n[[alpha]]", + "#REDIRECT [[alpha]]", + "#REDIRECT : [[alpha]]", + "#REDIRECT :[[alpha]]", + "#REDIRECT [[alpha]]", + "#REDIRECT:\t[[alpha]]", + "#REDIRECT:\n[[alpha]]", + "#REDIRECT: [[alpha]]", + "#REDIRECT:[[alpha]]", + "#REDIRECT[[alpha]]", + "#REDIRECT[[alpha]]\n\nbeta", + "#REDIRECT[[alpha]]\n beta", + "#REDIRECT[[alpha]]\nbeta", + "#REDIRECT[[alpha]] \nbeta", + "#REDIRECT[[alpha]] beta", + 
"#REDIRECT[[alpha]] beta", + "#REDIRECT[[alpha]]''beta", + "#REDIRECT[[alpha]]beta", + "#REDIRECT[[alpha|]]", + "#REDIRECT[[alpha|]]beta", + "#REDIRECT[[alpha|beta\ngamma]]", + "#REDIRECT[[alpha|beta]]", + "#REDIRECT[[alpha|beta]]=gamma=", + "#REDIRECT[[alpha|beta]]gamma", + "#ReDiReCt[[alpha]]", + "#rEdIrEcT[[alpha]]", + "#redirect[[alpha]]", + ], + ), + ( + "table", + &[ + " {|\n |}", + " {|\n|}", + "alpha\n{|\nbeta\n|}", + "{|", + "{|\n |}", + "{|\n!\n alpha\n|}", + "{|\n!\n!\n|}", + "{|\n!\nalpha\n\nbeta\n|}", + "{|\n!\nalpha\n\n|}", + "{|\n!\nalpha\nbeta\n|}", + "{|\n!\nalpha \n|}", + "{|\n!\n|\n|}", + "{|\n!\n|-\n|}", + "{|\n!\n|}", + "{|\n! alpha\n|}", + "{|\n!!\n|}", + "{|\n!!!\n|}", + "{|\n!!!!\n|}", + "{|\n!!!|\n|}", + "{|\n!alpha\n\nbeta\n|}", + "{|\n!alpha\nbeta\n|}", + "{|\n!alpha\nbeta|gamma\n|}", + "{|\n!alpha\n|}", + "{|\n!alpha!!beta\n|}", + "{|\n!alpha!beta\n|}", + "{|\n!alpha|beta\n|}", + "{|\n!alpha||beta\n|}", + "{|\n!|\n|}", + "{|\n!|!!\n|}", + "{|\n!|alpha\n|}", + "{|\n!|alpha|beta\n|}", + "{|\n!||\n|}", + "{|\n!||alpha\n|}", + "{|\n!|||\n|}", + "{|\n*alpha\n|}", + "{|\n=alpha=\n|}", + "{|\nalpha\n|}", + "{|\n|", + "{|\n|\n alpha\n|}", + "{|\n|\n!\n|}", + "{|\n|\n*alpha\n|}", + "{|\n|\n=alpha=\n|}", + "{|\n|\nalpha\n\nbeta\n|}", + "{|\n|\nalpha\n\n|}", + "{|\n|\nalpha\nbeta\n|}", + "{|\n|\nalpha \n|}", + "{|\n|\n|\n|}", + "{|\n|\n|-\n|}", + "{|\n|\n|}", + "{|\n| alpha\n|}", + "{|\n|+\n alpha\n|}", + "{|\n|+\n*alpha\n|}", + "{|\n|+\n=alpha=\n|}", + "{|\n|+\nalpha\n\nbeta\n|}", + "{|\n|+\nalpha\nbeta\n|}", + "{|\n|+\nalpha\n|}", + "{|\n|+\n|+\n|}", + "{|\n|+\n|}", + "{|\n|+ alpha\n|}", + "{|\n|+!!\n|}", + "{|\n|+alpha\n\nbeta\n|}", + "{|\n|+alpha\nbeta\n|}", + "{|\n|+alpha\n|}", + "{|\n|+alpha \n|}", + "{|\n|+|\n|}", + "{|\n|+|alpha|\n|}", + "{|\n|+|alpha|beta\n|}", + "{|\n|+||\n|}", + "{|\n|+||alpha\n|}", + "{|\n|+|||\n|}", + "{|\n|-\n alpha\n|}", + "{|\n|-\n!\n|}", + "{|\n|-\n*alpha\n|}", + "{|\n|-\n=alpha=\n|}", + "{|\n|-\nalpha\n|}", + 
"{|\n|-\n|\n|}", + "{|\n|-\n|-\n|}", + "{|\n|-\n|}", + "{|\n|- alpha\n|}", + "{|\n|-alpha\n\n|}", + "{|\n|-alpha\n|}", + "{|\n|-alpha \n|}", + "{|\n|alpha\n\nbeta\n|}", + "{|\n|alpha\nbeta\n|}", + "{|\n|alpha\nbeta|gamma\n|}", + "{|\n|alpha\n|}", + "{|\n|alpha!!beta\n|}", + "{|\n|alpha!beta\n|}", + "{|\n|alpha|\n|}", + "{|\n|alpha|beta\n|}", + "{|\n|alpha||beta\n|}", + "{|\n||\n|}", + "{|\n||alpha\n|}", + "{|\n|||\n|}", + "{|\n||||\n|}", + "{|\n|}", + "{|\n|}\t\nalpha", + "{|\n|}\n\n\nalpha", + "{|\n|}\n\nalpha", + "{|\n|}\nalpha", + "{|\n|} \nalpha", + "{|\n|}alpha", + "{|alpha\nbeta\n|}", + "{|alpha\n|}", + ], + ), + ( + "tag", + &[ + "
", + "
", + "", + "", + "beta", + "", + "", + "", + " alpha", + "alpha", + "", + "", + "
", + "
", + "", + "
", + "", + "", + "beta", + "", + "", + "", + " alpha", + "alpha", + "", + "", + "
", + "
", + "alphabeta", + "", + "", + "", + "", + "\talpha", + "\nalpha", + " alpha", + "", + "alpha\t", + "alpha\n", + "alpha ", + "alpha", + "alpha", + ], + ), + ( + "template", + &[ + "*alpha}}", + "[[alpha|beta}}]]", + "alpha {{beta}}", + "alpha {{beta}} gamma", + "alpha{{beta}}", + "alpha{{beta}}gamma", + "{{\nalpha}}", + "{{''}}", + "{{[[alpha|beta}}", + "{{alpha", + "{{alpha\n|beta}}", + "{{alpha\n|}}", + "{{alpha\n}}", + "{{alpha|", + "{{alpha|\nbeta}}", + "{{alpha|\n}}", + "{{alpha| beta}}", + "{{alpha|''}}", + "{{alpha|beta", + "{{alpha|beta\n=gamma}}", + "{{alpha|beta\n}}", + "{{alpha|beta =gamma}}", + "{{alpha|beta }}", + "{{alpha|beta=\ngamma}}", + "{{alpha|beta= gamma}}", + "{{alpha|beta=gamma\n}}", + "{{alpha|beta=gamma }}", + "{{alpha|beta=gamma=delta}}", + "{{alpha|beta=gamma|delta=epsilon}}", + "{{alpha|beta=gamma|delta}}", + "{{alpha|beta=gamma}}", + "{{alpha|beta=}}", + "{{alpha|beta|gamma=delta}}", + "{{alpha|beta|gamma}}", + "{{alpha|beta}", + "{{alpha|beta}}", + "{{alpha|beta}} gamma", + "{{alpha|beta}}gamma", + "{{alpha|}", + "{{alpha|}}", + "{{alpha}", + "{{alpha}}", + "{{alpha}} beta", + "{{alpha}}beta", + "}}", + ], + ), +]; diff --git a/parse_wiki_text/readme.md b/parse_wiki_text/readme.md new file mode 100644 index 0000000..b6de3bc --- /dev/null +++ b/parse_wiki_text/readme.md @@ -0,0 +1,107 @@ + + +# Parse Wiki Text + +Parse wiki text from Mediawiki into a tree of elements. + +![Parse Wiki Text](https://portstrom.com/parse_wiki_text.svg) + +## Introduction + +Wiki text is a format that follows the PHP maxim “Make everything as inconsistent and confusing as possible”. There are hundreds of millions of interesting documents written in this format, distributed under free licenses on sites that use the Mediawiki software, mainly Wikipedia and Wiktionary. Being able to parse wiki text and process these documents would allow access to a significant part of the world's knowledge. 
+ +The Mediawiki software itself transforms a wiki text document into an HTML document in an outdated format to be displayed in a browser for a human reader. It does so through a [step by step procedure](https://www.mediawiki.org/wiki/Manual:Parser.php) of string substitutions, with some of the steps depending on the result of previous steps. [The main file for this procedure](https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html) has 6200 lines of code and the [second biggest file](https://doc.wikimedia.org/mediawiki-core/master/php/Preprocessor__DOM_8php_source.html) has 2000, and then there is a [1400 line file](https://doc.wikimedia.org/mediawiki-core/master/php/ParserOptions_8php_source.html) just to take options for the parser. + +What would be more interesting is to parse the wiki text document into a structure that can be used by a computer program to reason about the facts in the document and present them in different ways, making them available for a great variety of applications. + +Some people have tried to parse wiki text using regular expressions. This is incredibly naive and fails as soon as the wiki text is non-trivial. The capabilities of regular expressions don't come anywhere close to the complexity of the weirdness required to correctly parse wiki text. One project did a brave attempt to use a parser generator to parse wiki text. Wiki text was however never designed for formal parsers, so even parser generators are of no help in correctly parsing wiki text. + +Wiki text has a long history of poorly designed additions carelessly piled on top of each other. The syntax of wiki text is different in each wiki depending on its configuration. You can't even know what's a start tag until you see the corresponding end tag, and you can't know where the end tag is unless you parse the entire hierarchy of nested tags between the start tag and the end tag. In short: If you think you understand wiki text, you don't understand wiki text. 
+ +Parse Wiki Text attempts to take all uncertainty out of parsing wiki text by converting it to another format that is easy to work with. The target format is Rust objects that can ergonomically be processed using iterators and match expressions. + +## Design goals + +### Correctness + +Parse Wiki Text is designed to parse wiki text exactly as parsed by Mediawiki. Even when there is obviously a bug in Mediawiki, Parse Wiki Text replicates that exact bug. If there is something Parse Wiki Text doesn't parse exactly the same as Mediawiki, please report it as an issue. + +### Speed + +Parse Wiki Text is designed to parse a page in as little time as possible. It parses tens of thousands of pages per second on each processor core and can quickly parse an entire wiki with millions of pages. If there is anything that can be changed to make Parse Wiki Text faster, please report it as an issue. + +### Safety + +Parse Wiki Text is designed to work with untrusted inputs. If any input doesn't parse safely with reasonable resources, please report it as an issue. No unsafe code is used. + +### Platform support + +Parse Wiki Text is designed to run in a wide variety of environments, such as: + +- servers running machine code +- browsers running Web Assembly +- embedded in other programming languages + +Parse Wiki Text can be deployed anywhere with no dependencies. + +## Caution + +Wiki text is a legacy format used by legacy software. Parse Wiki Text is intended only to recover information that has been written for wikis running legacy software, replicating the exact bugs found in the legacy software. Please don't use wiki text as a format for new applications. Wiki text is a horrible format with an astonishing amount of inconsistencies, bad design choices and bugs. For new applications, please use a format that is designed to be easy to process, such as JSON or even better [CBOR](http://cbor.io). 
See [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) for an example of a wiki that uses JSON as its format and provides a rich interface for editing data instead of letting people write code. If you need to take information written in wiki text and reuse it in a new application, you can use Parse Wiki Text to convert it to an intermediate format that you can further process into a modern format. + +## Site configuration + +Wiki text has plenty of features that are parsed in a way that depends on the configuration of the wiki. This means the configuration must be known before parsing. + +- External links are parsed only when the scheme of the URI of the link is in the configured list of valid protocols. When the scheme is not valid, the link is parsed as plain text. +- Categories and images superficially look the same way as links, but are parsed differently. These can only be distinguished by knowing the namespace aliases from the configuration of the wiki. +- Text matching the configured set of magic words is parsed as magic words. +- Extension tags have the same syntax as HTML tags, but are parsed differently. The configuration tells which tag names are to be treated as extension tags. + +The configuration can be seen by making a request to the [site info](https://www.mediawiki.org/wiki/API:Siteinfo) resource on the wiki. The utility [Fetch site configuration](https://github.com/portstrom/fetch_mediawiki_configuration) fetches the parts of the configuration needed for parsing pages in the wiki, and outputs Rust code for instantiating a parser with that configuration. Parse Wiki Text contains a default configuration that can be used for testing. + +## Limitations + +Wiki text was never designed to be possible to parse into a structured format. It's designed to be parsed in multiple passes, where each pass depends on the output of the previous pass. 
Most importantly, templates are expanded in an earlier pass and formatting codes are parsed in a later pass. This means the formatting codes you see in the original text are not necessarily the same as the parser will see after templates have been expanded. Luckily this is as bad for human editors as it is for computers, so people tend to avoid writing templates that cause formatting codes to be parsed in a way that differs from what they would expect from reading the original wiki text before expanding templates. Parse Wiki Text assumes that templates never change the meaning of formatting codes around them. + +## Sandbox + +A sandbox ([Github](https://github.com/portstrom/parse_wiki_text_sandbox), [try online](https://portstrom.com/parse_wiki_text_sandbox/)) is available that allows interactively entering wiki text and inspecting the result of parsing it. + +## Comparison with Mediawiki Parser + +There is another crate called Mediawiki Parser ([crates.io](https://crates.io/crates/mediawiki_parser), [Github](https://github.com/vroland/mediawiki-parser)) that does basically the same thing, parsing wiki text to a tree of elements. That crate however doesn't take into account any of the astonishing amount of weirdness required to correctly parse wiki text. That crate admittedly only parses a subset of wiki text, with the intention to report errors for any text that is too weird to fit that subset, which is a good intention, but when examining it, that subset is quickly found to be too small to parse pages from actual wikis, and even worse, the error reporting is just an empty promise, and there's no indication when a text is incorrectly parsed. + +That crate could possibly be improved to always report errors when a text isn't in the supported subset, but pages found in real wikis very often don't conform to the small subset of wiki text that can be parsed without weirdness, so it still wouldn't be useful. 
Improving that crate to correctly parse a large enough subset of wiki text would be as much effort as starting over from scratch, which is why Parse Wiki Text was made without taking anything from Mediawiki Parser. Parse Wiki Text aims to correctly parse all wiki text, not just a subset, and report warnings when encountering weirdness that should be avoided. + +## Examples + +The default configuration is used for testing purposes only. +For parsing a real wiki you need a site-specific configuration. +Reuse the same configuration when parsing multiple pages for efficiency. + +```rust +use parse_wiki_text::{Configuration, Node}; +let wiki_text = concat!( + "==Our values==\n", + "*Correctness\n", + "*Speed\n", + "*Ergonomics" +); +let result = Configuration::default().parse(wiki_text); +assert!(result.warnings.is_empty()); +for node in result.nodes { + if let Node::UnorderedList { items, .. } = node { + println!("Our values are:"); + for item in items { + println!("- {}", item.nodes.iter().map(|node| match node { + Node::Text { value, .. } => value, + _ => "" + }).collect::<String>()); + } + } +} +``` diff --git a/parse_wiki_text/src/bold_italic.rs b/parse_wiki_text/src/bold_italic.rs new file mode 100644 index 0000000..e5ac613 --- /dev/null +++ b/parse_wiki_text/src/bold_italic.rs @@ -0,0 +1,33 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+/// Turns the apostrophe run at the current scan position into an `Italic`, `Bold` or `BoldItalic` node pushed onto `state.nodes`.
+pub fn parse_bold_italic(state: &mut crate::State) {
+    let scan_position = state.scan_position;
+    state.flush(scan_position); // presumably emits the text pending before the marker; `State::flush` is defined elsewhere -- confirm there
+    let start_position = state.scan_position;
+    state.scan_position += 2; // skipped without inspection; assumes the caller already matched two apostrophes -- TODO confirm
+    while state.get_byte(state.scan_position) == Some(b'\'') { // consume every further consecutive apostrophe byte
+        state.scan_position += 1;
+    }
+    let length = state.scan_position - start_position; // total bytes in the run: the 2 skipped plus those consumed by the loop
+    if length < 3 { // only the two skipped bytes: italic covering the whole run
+        state.flushed_position = state.scan_position;
+        state.nodes.push(crate::Node::Italic {
+            end: state.flushed_position,
+            start: start_position,
+        });
+    } else if length < 5 { // 3 or 4 bytes: bold covers the first 3; scan_position stays past the whole run
+        state.flushed_position = start_position + 3;
+        state.nodes.push(crate::Node::Bold {
+            end: state.flushed_position,
+            start: start_position,
+        });
+    } else { // 5 or more bytes: bold-italic covers the first 5; any remainder lies between flushed_position and scan_position
+        state.flushed_position = start_position + 5;
+        state.nodes.push(crate::Node::BoldItalic {
+            end: state.flushed_position,
+            start: start_position,
+        });
+    }
+}
diff --git a/parse_wiki_text/src/case_folding_simple.rs b/parse_wiki_text/src/case_folding_simple.rs new file mode 100644 index 0000000..3bd48c9 --- /dev/null +++ b/parse_wiki_text/src/case_folding_simple.rs @@ -0,0 +1,2632 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple /home/andrew/tmp/ucd-10.0.0/ --chars --all-pairs +// +// ucd-generate is available on crates.io. 
+ +pub const CASE_FOLDING_SIMPLE: &[(char, &[char])] = &[ + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k', 'K']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s', 'ſ']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K', 'K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S', 'ſ']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('µ', &['Μ', 'μ']), + ('À', &['à']), + ('Á', &['á']), + ('Â', &['â']), + ('Ã', &['ã']), + ('Ä', &['ä']), + ('Å', &['å', 'Å']), + ('Æ', &['æ']), + ('Ç', &['ç']), + ('È', &['è']), + ('É', &['é']), + ('Ê', &['ê']), + ('Ë', &['ë']), + ('Ì', &['ì']), + ('Í', &['í']), + ('Î', &['î']), + ('Ï', &['ï']), + ('Ð', &['ð']), + ('Ñ', &['ñ']), + ('Ò', &['ò']), + ('Ó', &['ó']), + ('Ô', &['ô']), + ('Õ', &['õ']), + ('Ö', &['ö']), + ('Ø', &['ø']), + ('Ù', &['ù']), + ('Ú', &['ú']), + ('Û', &['û']), + ('Ü', &['ü']), + ('Ý', &['ý']), + ('Þ', &['þ']), + ('ß', &['ẞ']), + ('à', &['À']), + ('á', &['Á']), + ('â', &['Â']), + ('ã', &['Ã']), + ('ä', &['Ä']), + ('å', &['Å', 'Å']), + ('æ', &['Æ']), + ('ç', &['Ç']), + ('è', &['È']), + ('é', &['É']), + ('ê', &['Ê']), + ('ë', &['Ë']), + ('ì', &['Ì']), + ('í', &['Í']), + ('î', &['Î']), + ('ï', &['Ï']), + ('ð', &['Ð']), + ('ñ', &['Ñ']), + ('ò', &['Ò']), + ('ó', &['Ó']), + ('ô', &['Ô']), + ('õ', &['Õ']), + ('ö', &['Ö']), + ('ø', &['Ø']), + ('ù', &['Ù']), + ('ú', &['Ú']), + ('û', &['Û']), + ('ü', &['Ü']), + 
('ý', &['Ý']), + ('þ', &['Þ']), + ('ÿ', &['Ÿ']), + ('Ā', &['ā']), + ('ā', &['Ā']), + ('Ă', &['ă']), + ('ă', &['Ă']), + ('Ą', &['ą']), + ('ą', &['Ą']), + ('Ć', &['ć']), + ('ć', &['Ć']), + ('Ĉ', &['ĉ']), + ('ĉ', &['Ĉ']), + ('Ċ', &['ċ']), + ('ċ', &['Ċ']), + ('Č', &['č']), + ('č', &['Č']), + ('Ď', &['ď']), + ('ď', &['Ď']), + ('Đ', &['đ']), + ('đ', &['Đ']), + ('Ē', &['ē']), + ('ē', &['Ē']), + ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), + ('Ė', &['ė']), + ('ė', &['Ė']), + ('Ę', &['ę']), + ('ę', &['Ę']), + ('Ě', &['ě']), + ('ě', &['Ě']), + ('Ĝ', &['ĝ']), + ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), + ('ğ', &['Ğ']), + ('Ġ', &['ġ']), + ('ġ', &['Ġ']), + ('Ģ', &['ģ']), + ('ģ', &['Ģ']), + ('Ĥ', &['ĥ']), + ('ĥ', &['Ĥ']), + ('Ħ', &['ħ']), + ('ħ', &['Ħ']), + ('Ĩ', &['ĩ']), + ('ĩ', &['Ĩ']), + ('Ī', &['ī']), + ('ī', &['Ī']), + ('Ĭ', &['ĭ']), + ('ĭ', &['Ĭ']), + ('Į', &['į']), + ('į', &['Į']), + ('IJ', &['ij']), + ('ij', &['IJ']), + ('Ĵ', &['ĵ']), + ('ĵ', &['Ĵ']), + ('Ķ', &['ķ']), + ('ķ', &['Ķ']), + ('Ĺ', &['ĺ']), + ('ĺ', &['Ĺ']), + ('Ļ', &['ļ']), + ('ļ', &['Ļ']), + ('Ľ', &['ľ']), + ('ľ', &['Ľ']), + ('Ŀ', &['ŀ']), + ('ŀ', &['Ŀ']), + ('Ł', &['ł']), + ('ł', &['Ł']), + ('Ń', &['ń']), + ('ń', &['Ń']), + ('Ņ', &['ņ']), + ('ņ', &['Ņ']), + ('Ň', &['ň']), + ('ň', &['Ň']), + ('Ŋ', &['ŋ']), + ('ŋ', &['Ŋ']), + ('Ō', &['ō']), + ('ō', &['Ō']), + ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), + ('Ő', &['ő']), + ('ő', &['Ő']), + ('Œ', &['œ']), + ('œ', &['Œ']), + ('Ŕ', &['ŕ']), + ('ŕ', &['Ŕ']), + ('Ŗ', &['ŗ']), + ('ŗ', &['Ŗ']), + ('Ř', &['ř']), + ('ř', &['Ř']), + ('Ś', &['ś']), + ('ś', &['Ś']), + ('Ŝ', &['ŝ']), + ('ŝ', &['Ŝ']), + ('Ş', &['ş']), + ('ş', &['Ş']), + ('Š', &['š']), + ('š', &['Š']), + ('Ţ', &['ţ']), + ('ţ', &['Ţ']), + ('Ť', &['ť']), + ('ť', &['Ť']), + ('Ŧ', &['ŧ']), + ('ŧ', &['Ŧ']), + ('Ũ', &['ũ']), + ('ũ', &['Ũ']), + ('Ū', &['ū']), + ('ū', &['Ū']), + ('Ŭ', &['ŭ']), + ('ŭ', &['Ŭ']), + ('Ů', &['ů']), + ('ů', &['Ů']), + ('Ű', &['ű']), + ('ű', &['Ű']), + ('Ų', &['ų']), + ('ų', &['Ų']), + ('Ŵ', &['ŵ']), + ('ŵ', &['Ŵ']), + ('Ŷ', 
&['ŷ']), + ('ŷ', &['Ŷ']), + ('Ÿ', &['ÿ']), + ('Ź', &['ź']), + ('ź', &['Ź']), + ('Ż', &['ż']), + ('ż', &['Ż']), + ('Ž', &['ž']), + ('ž', &['Ž']), + ('ſ', &['S', 's']), + ('ƀ', &['Ƀ']), + ('Ɓ', &['ɓ']), + ('Ƃ', &['ƃ']), + ('ƃ', &['Ƃ']), + ('Ƅ', &['ƅ']), + ('ƅ', &['Ƅ']), + ('Ɔ', &['ɔ']), + ('Ƈ', &['ƈ']), + ('ƈ', &['Ƈ']), + ('Ɖ', &['ɖ']), + ('Ɗ', &['ɗ']), + ('Ƌ', &['ƌ']), + ('ƌ', &['Ƌ']), + ('Ǝ', &['ǝ']), + ('Ə', &['ə']), + ('Ɛ', &['ɛ']), + ('Ƒ', &['ƒ']), + ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), + ('Ɣ', &['ɣ']), + ('ƕ', &['Ƕ']), + ('Ɩ', &['ɩ']), + ('Ɨ', &['ɨ']), + ('Ƙ', &['ƙ']), + ('ƙ', &['Ƙ']), + ('ƚ', &['Ƚ']), + ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), + ('ƞ', &['Ƞ']), + ('Ɵ', &['ɵ']), + ('Ơ', &['ơ']), + ('ơ', &['Ơ']), + ('Ƣ', &['ƣ']), + ('ƣ', &['Ƣ']), + ('Ƥ', &['ƥ']), + ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), + ('Ƨ', &['ƨ']), + ('ƨ', &['Ƨ']), + ('Ʃ', &['ʃ']), + ('Ƭ', &['ƭ']), + ('ƭ', &['Ƭ']), + ('Ʈ', &['ʈ']), + ('Ư', &['ư']), + ('ư', &['Ư']), + ('Ʊ', &['ʊ']), + ('Ʋ', &['ʋ']), + ('Ƴ', &['ƴ']), + ('ƴ', &['Ƴ']), + ('Ƶ', &['ƶ']), + ('ƶ', &['Ƶ']), + ('Ʒ', &['ʒ']), + ('Ƹ', &['ƹ']), + ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), + ('ƽ', &['Ƽ']), + ('ƿ', &['Ƿ']), + ('DŽ', &['Dž', 'dž']), + ('Dž', &['DŽ', 'dž']), + ('dž', &['DŽ', 'Dž']), + ('LJ', &['Lj', 'lj']), + ('Lj', &['LJ', 'lj']), + ('lj', &['LJ', 'Lj']), + ('NJ', &['Nj', 'nj']), + ('Nj', &['NJ', 'nj']), + ('nj', &['NJ', 'Nj']), + ('Ǎ', &['ǎ']), + ('ǎ', &['Ǎ']), + ('Ǐ', &['ǐ']), + ('ǐ', &['Ǐ']), + ('Ǒ', &['ǒ']), + ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), + ('ǔ', &['Ǔ']), + ('Ǖ', &['ǖ']), + ('ǖ', &['Ǖ']), + ('Ǘ', &['ǘ']), + ('ǘ', &['Ǘ']), + ('Ǚ', &['ǚ']), + ('ǚ', &['Ǚ']), + ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), + ('ǝ', &['Ǝ']), + ('Ǟ', &['ǟ']), + ('ǟ', &['Ǟ']), + ('Ǡ', &['ǡ']), + ('ǡ', &['Ǡ']), + ('Ǣ', &['ǣ']), + ('ǣ', &['Ǣ']), + ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), + ('Ǧ', &['ǧ']), + ('ǧ', &['Ǧ']), + ('Ǩ', &['ǩ']), + ('ǩ', &['Ǩ']), + ('Ǫ', &['ǫ']), + ('ǫ', &['Ǫ']), + ('Ǭ', &['ǭ']), + ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), + ('ǯ', &['Ǯ']), + ('DZ', &['Dz', 'dz']), + ('Dz', &['DZ', 
'dz']), + ('dz', &['DZ', 'Dz']), + ('Ǵ', &['ǵ']), + ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), + ('Ƿ', &['ƿ']), + ('Ǹ', &['ǹ']), + ('ǹ', &['Ǹ']), + ('Ǻ', &['ǻ']), + ('ǻ', &['Ǻ']), + ('Ǽ', &['ǽ']), + ('ǽ', &['Ǽ']), + ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), + ('Ȁ', &['ȁ']), + ('ȁ', &['Ȁ']), + ('Ȃ', &['ȃ']), + ('ȃ', &['Ȃ']), + ('Ȅ', &['ȅ']), + ('ȅ', &['Ȅ']), + ('Ȇ', &['ȇ']), + ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), + ('ȉ', &['Ȉ']), + ('Ȋ', &['ȋ']), + ('ȋ', &['Ȋ']), + ('Ȍ', &['ȍ']), + ('ȍ', &['Ȍ']), + ('Ȏ', &['ȏ']), + ('ȏ', &['Ȏ']), + ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), + ('Ȓ', &['ȓ']), + ('ȓ', &['Ȓ']), + ('Ȕ', &['ȕ']), + ('ȕ', &['Ȕ']), + ('Ȗ', &['ȗ']), + ('ȗ', &['Ȗ']), + ('Ș', &['ș']), + ('ș', &['Ș']), + ('Ț', &['ț']), + ('ț', &['Ț']), + ('Ȝ', &['ȝ']), + ('ȝ', &['Ȝ']), + ('Ȟ', &['ȟ']), + ('ȟ', &['Ȟ']), + ('Ƞ', &['ƞ']), + ('Ȣ', &['ȣ']), + ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), + ('ȥ', &['Ȥ']), + ('Ȧ', &['ȧ']), + ('ȧ', &['Ȧ']), + ('Ȩ', &['ȩ']), + ('ȩ', &['Ȩ']), + ('Ȫ', &['ȫ']), + ('ȫ', &['Ȫ']), + ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), + ('Ȯ', &['ȯ']), + ('ȯ', &['Ȯ']), + ('Ȱ', &['ȱ']), + ('ȱ', &['Ȱ']), + ('Ȳ', &['ȳ']), + ('ȳ', &['Ȳ']), + ('Ⱥ', &['ⱥ']), + ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), + ('Ƚ', &['ƚ']), + ('Ⱦ', &['ⱦ']), + ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), + ('Ɂ', &['ɂ']), + ('ɂ', &['Ɂ']), + ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), + ('Ʌ', &['ʌ']), + ('Ɇ', &['ɇ']), + ('ɇ', &['Ɇ']), + ('Ɉ', &['ɉ']), + ('ɉ', &['Ɉ']), + ('Ɋ', &['ɋ']), + ('ɋ', &['Ɋ']), + ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), + ('Ɏ', &['ɏ']), + ('ɏ', &['Ɏ']), + ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), + ('ɒ', &['Ɒ']), + ('ɓ', &['Ɓ']), + ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), + ('ɗ', &['Ɗ']), + ('ə', &['Ə']), + ('ɛ', &['Ɛ']), + ('ɜ', &['Ɜ']), + ('ɠ', &['Ɠ']), + ('ɡ', &['Ɡ']), + ('ɣ', &['Ɣ']), + ('ɥ', &['Ɥ']), + ('ɦ', &['Ɦ']), + ('ɨ', &['Ɨ']), + ('ɩ', &['Ɩ']), + ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), + ('ɬ', &['Ɬ']), + ('ɯ', &['Ɯ']), + ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), + ('ɵ', &['Ɵ']), + ('ɽ', &['Ɽ']), + ('ʀ', &['Ʀ']), + ('ʃ', &['Ʃ']), + ('ʇ', &['Ʇ']), + ('ʈ', &['Ʈ']), + ('ʉ', &['Ʉ']), + ('ʊ', 
&['Ʊ']), + ('ʋ', &['Ʋ']), + ('ʌ', &['Ʌ']), + ('ʒ', &['Ʒ']), + ('ʝ', &['Ʝ']), + ('ʞ', &['Ʞ']), + ('ͅ', &['Ι', 'ι', 'ι']), + ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), + ('Ͳ', &['ͳ']), + ('ͳ', &['Ͳ']), + ('Ͷ', &['ͷ']), + ('ͷ', &['Ͷ']), + ('ͻ', &['Ͻ']), + ('ͼ', &['Ͼ']), + ('ͽ', &['Ͽ']), + ('Ϳ', &['ϳ']), + ('Ά', &['ά']), + ('Έ', &['έ']), + ('Ή', &['ή']), + ('Ί', &['ί']), + ('Ό', &['ό']), + ('Ύ', &['ύ']), + ('Ώ', &['ώ']), + ('Α', &['α']), + ('Β', &['β', 'ϐ']), + ('Γ', &['γ']), + ('Δ', &['δ']), + ('Ε', &['ε', 'ϵ']), + ('Ζ', &['ζ']), + ('Η', &['η']), + ('Θ', &['θ', 'ϑ', 'ϴ']), + ('Ι', &['ͅ', 'ι', 'ι']), + ('Κ', &['κ', 'ϰ']), + ('Λ', &['λ']), + ('Μ', &['µ', 'μ']), + ('Ν', &['ν']), + ('Ξ', &['ξ']), + ('Ο', &['ο']), + ('Π', &['π', 'ϖ']), + ('Ρ', &['ρ', 'ϱ']), + ('Σ', &['ς', 'σ']), + ('Τ', &['τ']), + ('Υ', &['υ']), + ('Φ', &['φ', 'ϕ']), + ('Χ', &['χ']), + ('Ψ', &['ψ']), + ('Ω', &['ω', 'Ω']), + ('Ϊ', &['ϊ']), + ('Ϋ', &['ϋ']), + ('ά', &['Ά']), + ('έ', &['Έ']), + ('ή', &['Ή']), + ('ί', &['Ί']), + ('α', &['Α']), + ('β', &['Β', 'ϐ']), + ('γ', &['Γ']), + ('δ', &['Δ']), + ('ε', &['Ε', 'ϵ']), + ('ζ', &['Ζ']), + ('η', &['Η']), + ('θ', &['Θ', 'ϑ', 'ϴ']), + ('ι', &['ͅ', 'Ι', 'ι']), + ('κ', &['Κ', 'ϰ']), + ('λ', &['Λ']), + ('μ', &['µ', 'Μ']), + ('ν', &['Ν']), + ('ξ', &['Ξ']), + ('ο', &['Ο']), + ('π', &['Π', 'ϖ']), + ('ρ', &['Ρ', 'ϱ']), + ('ς', &['Σ', 'σ']), + ('σ', &['Σ', 'ς']), + ('τ', &['Τ']), + ('υ', &['Υ']), + ('φ', &['Φ', 'ϕ']), + ('χ', &['Χ']), + ('ψ', &['Ψ']), + ('ω', &['Ω', 'Ω']), + ('ϊ', &['Ϊ']), + ('ϋ', &['Ϋ']), + ('ό', &['Ό']), + ('ύ', &['Ύ']), + ('ώ', &['Ώ']), + ('Ϗ', &['ϗ']), + ('ϐ', &['Β', 'β']), + ('ϑ', &['Θ', 'θ', 'ϴ']), + ('ϕ', &['Φ', 'φ']), + ('ϖ', &['Π', 'π']), + ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), + ('ϙ', &['Ϙ']), + ('Ϛ', &['ϛ']), + ('ϛ', &['Ϛ']), + ('Ϝ', &['ϝ']), + ('ϝ', &['Ϝ']), + ('Ϟ', &['ϟ']), + ('ϟ', &['Ϟ']), + ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), + ('Ϣ', &['ϣ']), + ('ϣ', &['Ϣ']), + ('Ϥ', &['ϥ']), + ('ϥ', &['Ϥ']), + ('Ϧ', &['ϧ']), + ('ϧ', &['Ϧ']), + ('Ϩ', &['ϩ']), + ('ϩ', &['Ϩ']), + 
('Ϫ', &['ϫ']), + ('ϫ', &['Ϫ']), + ('Ϭ', &['ϭ']), + ('ϭ', &['Ϭ']), + ('Ϯ', &['ϯ']), + ('ϯ', &['Ϯ']), + ('ϰ', &['Κ', 'κ']), + ('ϱ', &['Ρ', 'ρ']), + ('ϲ', &['Ϲ']), + ('ϳ', &['Ϳ']), + ('ϴ', &['Θ', 'θ', 'ϑ']), + ('ϵ', &['Ε', 'ε']), + ('Ϸ', &['ϸ']), + ('ϸ', &['Ϸ']), + ('Ϲ', &['ϲ']), + ('Ϻ', &['ϻ']), + ('ϻ', &['Ϻ']), + ('Ͻ', &['ͻ']), + ('Ͼ', &['ͼ']), + ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), + ('Ё', &['ё']), + ('Ђ', &['ђ']), + ('Ѓ', &['ѓ']), + ('Є', &['є']), + ('Ѕ', &['ѕ']), + ('І', &['і']), + ('Ї', &['ї']), + ('Ј', &['ј']), + ('Љ', &['љ']), + ('Њ', &['њ']), + ('Ћ', &['ћ']), + ('Ќ', &['ќ']), + ('Ѝ', &['ѝ']), + ('Ў', &['ў']), + ('Џ', &['џ']), + ('А', &['а']), + ('Б', &['б']), + ('В', &['в', 'ᲀ']), + ('Г', &['г']), + ('Д', &['д', 'ᲁ']), + ('Е', &['е']), + ('Ж', &['ж']), + ('З', &['з']), + ('И', &['и']), + ('Й', &['й']), + ('К', &['к']), + ('Л', &['л']), + ('М', &['м']), + ('Н', &['н']), + ('О', &['о', 'ᲂ']), + ('П', &['п']), + ('Р', &['р']), + ('С', &['с', 'ᲃ']), + ('Т', &['т', 'ᲄ', 'ᲅ']), + ('У', &['у']), + ('Ф', &['ф']), + ('Х', &['х']), + ('Ц', &['ц']), + ('Ч', &['ч']), + ('Ш', &['ш']), + ('Щ', &['щ']), + ('Ъ', &['ъ', 'ᲆ']), + ('Ы', &['ы']), + ('Ь', &['ь']), + ('Э', &['э']), + ('Ю', &['ю']), + ('Я', &['я']), + ('а', &['А']), + ('б', &['Б']), + ('в', &['В', 'ᲀ']), + ('г', &['Г']), + ('д', &['Д', 'ᲁ']), + ('е', &['Е']), + ('ж', &['Ж']), + ('з', &['З']), + ('и', &['И']), + ('й', &['Й']), + ('к', &['К']), + ('л', &['Л']), + ('м', &['М']), + ('н', &['Н']), + ('о', &['О', 'ᲂ']), + ('п', &['П']), + ('р', &['Р']), + ('с', &['С', 'ᲃ']), + ('т', &['Т', 'ᲄ', 'ᲅ']), + ('у', &['У']), + ('ф', &['Ф']), + ('х', &['Х']), + ('ц', &['Ц']), + ('ч', &['Ч']), + ('ш', &['Ш']), + ('щ', &['Щ']), + ('ъ', &['Ъ', 'ᲆ']), + ('ы', &['Ы']), + ('ь', &['Ь']), + ('э', &['Э']), + ('ю', &['Ю']), + ('я', &['Я']), + ('ѐ', &['Ѐ']), + ('ё', &['Ё']), + ('ђ', &['Ђ']), + ('ѓ', &['Ѓ']), + ('є', &['Є']), + ('ѕ', &['Ѕ']), + ('і', &['І']), + ('ї', &['Ї']), + ('ј', &['Ј']), + ('љ', &['Љ']), + ('њ', &['Њ']), + ('ћ', &['Ћ']), + 
('ќ', &['Ќ']), + ('ѝ', &['Ѝ']), + ('ў', &['Ў']), + ('џ', &['Џ']), + ('Ѡ', &['ѡ']), + ('ѡ', &['Ѡ']), + ('Ѣ', &['ѣ', 'ᲇ']), + ('ѣ', &['Ѣ', 'ᲇ']), + ('Ѥ', &['ѥ']), + ('ѥ', &['Ѥ']), + ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), + ('Ѩ', &['ѩ']), + ('ѩ', &['Ѩ']), + ('Ѫ', &['ѫ']), + ('ѫ', &['Ѫ']), + ('Ѭ', &['ѭ']), + ('ѭ', &['Ѭ']), + ('Ѯ', &['ѯ']), + ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), + ('ѱ', &['Ѱ']), + ('Ѳ', &['ѳ']), + ('ѳ', &['Ѳ']), + ('Ѵ', &['ѵ']), + ('ѵ', &['Ѵ']), + ('Ѷ', &['ѷ']), + ('ѷ', &['Ѷ']), + ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), + ('Ѻ', &['ѻ']), + ('ѻ', &['Ѻ']), + ('Ѽ', &['ѽ']), + ('ѽ', &['Ѽ']), + ('Ѿ', &['ѿ']), + ('ѿ', &['Ѿ']), + ('Ҁ', &['ҁ']), + ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), + ('ҋ', &['Ҋ']), + ('Ҍ', &['ҍ']), + ('ҍ', &['Ҍ']), + ('Ҏ', &['ҏ']), + ('ҏ', &['Ҏ']), + ('Ґ', &['ґ']), + ('ґ', &['Ґ']), + ('Ғ', &['ғ']), + ('ғ', &['Ғ']), + ('Ҕ', &['ҕ']), + ('ҕ', &['Ҕ']), + ('Җ', &['җ']), + ('җ', &['Җ']), + ('Ҙ', &['ҙ']), + ('ҙ', &['Ҙ']), + ('Қ', &['қ']), + ('қ', &['Қ']), + ('Ҝ', &['ҝ']), + ('ҝ', &['Ҝ']), + ('Ҟ', &['ҟ']), + ('ҟ', &['Ҟ']), + ('Ҡ', &['ҡ']), + ('ҡ', &['Ҡ']), + ('Ң', &['ң']), + ('ң', &['Ң']), + ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), + ('Ҧ', &['ҧ']), + ('ҧ', &['Ҧ']), + ('Ҩ', &['ҩ']), + ('ҩ', &['Ҩ']), + ('Ҫ', &['ҫ']), + ('ҫ', &['Ҫ']), + ('Ҭ', &['ҭ']), + ('ҭ', &['Ҭ']), + ('Ү', &['ү']), + ('ү', &['Ү']), + ('Ұ', &['ұ']), + ('ұ', &['Ұ']), + ('Ҳ', &['ҳ']), + ('ҳ', &['Ҳ']), + ('Ҵ', &['ҵ']), + ('ҵ', &['Ҵ']), + ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), + ('Ҹ', &['ҹ']), + ('ҹ', &['Ҹ']), + ('Һ', &['һ']), + ('һ', &['Һ']), + ('Ҽ', &['ҽ']), + ('ҽ', &['Ҽ']), + ('Ҿ', &['ҿ']), + ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), + ('Ӂ', &['ӂ']), + ('ӂ', &['Ӂ']), + ('Ӄ', &['ӄ']), + ('ӄ', &['Ӄ']), + ('Ӆ', &['ӆ']), + ('ӆ', &['Ӆ']), + ('Ӈ', &['ӈ']), + ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), + ('ӊ', &['Ӊ']), + ('Ӌ', &['ӌ']), + ('ӌ', &['Ӌ']), + ('Ӎ', &['ӎ']), + ('ӎ', &['Ӎ']), + ('ӏ', &['Ӏ']), + ('Ӑ', &['ӑ']), + ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), + ('ӓ', &['Ӓ']), + ('Ӕ', &['ӕ']), + ('ӕ', &['Ӕ']), + ('Ӗ', &['ӗ']), + ('ӗ', &['Ӗ']), + ('Ә', &['ә']), + 
('ә', &['Ә']), + ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), + ('Ӝ', &['ӝ']), + ('ӝ', &['Ӝ']), + ('Ӟ', &['ӟ']), + ('ӟ', &['Ӟ']), + ('Ӡ', &['ӡ']), + ('ӡ', &['Ӡ']), + ('Ӣ', &['ӣ']), + ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), + ('ӥ', &['Ӥ']), + ('Ӧ', &['ӧ']), + ('ӧ', &['Ӧ']), + ('Ө', &['ө']), + ('ө', &['Ө']), + ('Ӫ', &['ӫ']), + ('ӫ', &['Ӫ']), + ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), + ('Ӯ', &['ӯ']), + ('ӯ', &['Ӯ']), + ('Ӱ', &['ӱ']), + ('ӱ', &['Ӱ']), + ('Ӳ', &['ӳ']), + ('ӳ', &['Ӳ']), + ('Ӵ', &['ӵ']), + ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), + ('ӷ', &['Ӷ']), + ('Ӹ', &['ӹ']), + ('ӹ', &['Ӹ']), + ('Ӻ', &['ӻ']), + ('ӻ', &['Ӻ']), + ('Ӽ', &['ӽ']), + ('ӽ', &['Ӽ']), + ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), + ('Ԁ', &['ԁ']), + ('ԁ', &['Ԁ']), + ('Ԃ', &['ԃ']), + ('ԃ', &['Ԃ']), + ('Ԅ', &['ԅ']), + ('ԅ', &['Ԅ']), + ('Ԇ', &['ԇ']), + ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), + ('ԉ', &['Ԉ']), + ('Ԋ', &['ԋ']), + ('ԋ', &['Ԋ']), + ('Ԍ', &['ԍ']), + ('ԍ', &['Ԍ']), + ('Ԏ', &['ԏ']), + ('ԏ', &['Ԏ']), + ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), + ('Ԓ', &['ԓ']), + ('ԓ', &['Ԓ']), + ('Ԕ', &['ԕ']), + ('ԕ', &['Ԕ']), + ('Ԗ', &['ԗ']), + ('ԗ', &['Ԗ']), + ('Ԙ', &['ԙ']), + ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), + ('ԛ', &['Ԛ']), + ('Ԝ', &['ԝ']), + ('ԝ', &['Ԝ']), + ('Ԟ', &['ԟ']), + ('ԟ', &['Ԟ']), + ('Ԡ', &['ԡ']), + ('ԡ', &['Ԡ']), + ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), + ('Ԥ', &['ԥ']), + ('ԥ', &['Ԥ']), + ('Ԧ', &['ԧ']), + ('ԧ', &['Ԧ']), + ('Ԩ', &['ԩ']), + ('ԩ', &['Ԩ']), + ('Ԫ', &['ԫ']), + ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), + ('ԭ', &['Ԭ']), + ('Ԯ', &['ԯ']), + ('ԯ', &['Ԯ']), + ('Ա', &['ա']), + ('Բ', &['բ']), + ('Գ', &['գ']), + ('Դ', &['դ']), + ('Ե', &['ե']), + ('Զ', &['զ']), + ('Է', &['է']), + ('Ը', &['ը']), + ('Թ', &['թ']), + ('Ժ', &['ժ']), + ('Ի', &['ի']), + ('Լ', &['լ']), + ('Խ', &['խ']), + ('Ծ', &['ծ']), + ('Կ', &['կ']), + ('Հ', &['հ']), + ('Ձ', &['ձ']), + ('Ղ', &['ղ']), + ('Ճ', &['ճ']), + ('Մ', &['մ']), + ('Յ', &['յ']), + ('Ն', &['ն']), + ('Շ', &['շ']), + ('Ո', &['ո']), + ('Չ', &['չ']), + ('Պ', &['պ']), + ('Ջ', &['ջ']), + ('Ռ', &['ռ']), + ('Ս', &['ս']), + ('Վ', &['վ']), + ('Տ', 
&['տ']), + ('Ր', &['ր']), + ('Ց', &['ց']), + ('Ւ', &['ւ']), + ('Փ', &['փ']), + ('Ք', &['ք']), + ('Օ', &['օ']), + ('Ֆ', &['ֆ']), + ('ա', &['Ա']), + ('բ', &['Բ']), + ('գ', &['Գ']), + ('դ', &['Դ']), + ('ե', &['Ե']), + ('զ', &['Զ']), + ('է', &['Է']), + ('ը', &['Ը']), + ('թ', &['Թ']), + ('ժ', &['Ժ']), + ('ի', &['Ի']), + ('լ', &['Լ']), + ('խ', &['Խ']), + ('ծ', &['Ծ']), + ('կ', &['Կ']), + ('հ', &['Հ']), + ('ձ', &['Ձ']), + ('ղ', &['Ղ']), + ('ճ', &['Ճ']), + ('մ', &['Մ']), + ('յ', &['Յ']), + ('ն', &['Ն']), + ('շ', &['Շ']), + ('ո', &['Ո']), + ('չ', &['Չ']), + ('պ', &['Պ']), + ('ջ', &['Ջ']), + ('ռ', &['Ռ']), + ('ս', &['Ս']), + ('վ', &['Վ']), + ('տ', &['Տ']), + ('ր', &['Ր']), + ('ց', &['Ց']), + ('ւ', &['Ւ']), + ('փ', &['Փ']), + ('ք', &['Ք']), + ('օ', &['Օ']), + ('ֆ', &['Ֆ']), + ('Ⴀ', &['ⴀ']), + ('Ⴁ', &['ⴁ']), + ('Ⴂ', &['ⴂ']), + ('Ⴃ', &['ⴃ']), + ('Ⴄ', &['ⴄ']), + ('Ⴅ', &['ⴅ']), + ('Ⴆ', &['ⴆ']), + ('Ⴇ', &['ⴇ']), + ('Ⴈ', &['ⴈ']), + ('Ⴉ', &['ⴉ']), + ('Ⴊ', &['ⴊ']), + ('Ⴋ', &['ⴋ']), + ('Ⴌ', &['ⴌ']), + ('Ⴍ', &['ⴍ']), + ('Ⴎ', &['ⴎ']), + ('Ⴏ', &['ⴏ']), + ('Ⴐ', &['ⴐ']), + ('Ⴑ', &['ⴑ']), + ('Ⴒ', &['ⴒ']), + ('Ⴓ', &['ⴓ']), + ('Ⴔ', &['ⴔ']), + ('Ⴕ', &['ⴕ']), + ('Ⴖ', &['ⴖ']), + ('Ⴗ', &['ⴗ']), + ('Ⴘ', &['ⴘ']), + ('Ⴙ', &['ⴙ']), + ('Ⴚ', &['ⴚ']), + ('Ⴛ', &['ⴛ']), + ('Ⴜ', &['ⴜ']), + ('Ⴝ', &['ⴝ']), + ('Ⴞ', &['ⴞ']), + ('Ⴟ', &['ⴟ']), + ('Ⴠ', &['ⴠ']), + ('Ⴡ', &['ⴡ']), + ('Ⴢ', &['ⴢ']), + ('Ⴣ', &['ⴣ']), + ('Ⴤ', &['ⴤ']), + ('Ⴥ', &['ⴥ']), + ('Ⴧ', &['ⴧ']), + ('Ⴭ', &['ⴭ']), + ('Ꭰ', &['ꭰ']), + ('Ꭱ', &['ꭱ']), + ('Ꭲ', &['ꭲ']), + ('Ꭳ', &['ꭳ']), + ('Ꭴ', &['ꭴ']), + ('Ꭵ', &['ꭵ']), + ('Ꭶ', &['ꭶ']), + ('Ꭷ', &['ꭷ']), + ('Ꭸ', &['ꭸ']), + ('Ꭹ', &['ꭹ']), + ('Ꭺ', &['ꭺ']), + ('Ꭻ', &['ꭻ']), + ('Ꭼ', &['ꭼ']), + ('Ꭽ', &['ꭽ']), + ('Ꭾ', &['ꭾ']), + ('Ꭿ', &['ꭿ']), + ('Ꮀ', &['ꮀ']), + ('Ꮁ', &['ꮁ']), + ('Ꮂ', &['ꮂ']), + ('Ꮃ', &['ꮃ']), + ('Ꮄ', &['ꮄ']), + ('Ꮅ', &['ꮅ']), + ('Ꮆ', &['ꮆ']), + ('Ꮇ', &['ꮇ']), + ('Ꮈ', &['ꮈ']), + ('Ꮉ', &['ꮉ']), + ('Ꮊ', &['ꮊ']), + ('Ꮋ', &['ꮋ']), + ('Ꮌ', &['ꮌ']), + ('Ꮍ', &['ꮍ']), + ('Ꮎ', &['ꮎ']), + ('Ꮏ', &['ꮏ']), + 
('Ꮐ', &['ꮐ']), + ('Ꮑ', &['ꮑ']), + ('Ꮒ', &['ꮒ']), + ('Ꮓ', &['ꮓ']), + ('Ꮔ', &['ꮔ']), + ('Ꮕ', &['ꮕ']), + ('Ꮖ', &['ꮖ']), + ('Ꮗ', &['ꮗ']), + ('Ꮘ', &['ꮘ']), + ('Ꮙ', &['ꮙ']), + ('Ꮚ', &['ꮚ']), + ('Ꮛ', &['ꮛ']), + ('Ꮜ', &['ꮜ']), + ('Ꮝ', &['ꮝ']), + ('Ꮞ', &['ꮞ']), + ('Ꮟ', &['ꮟ']), + ('Ꮠ', &['ꮠ']), + ('Ꮡ', &['ꮡ']), + ('Ꮢ', &['ꮢ']), + ('Ꮣ', &['ꮣ']), + ('Ꮤ', &['ꮤ']), + ('Ꮥ', &['ꮥ']), + ('Ꮦ', &['ꮦ']), + ('Ꮧ', &['ꮧ']), + ('Ꮨ', &['ꮨ']), + ('Ꮩ', &['ꮩ']), + ('Ꮪ', &['ꮪ']), + ('Ꮫ', &['ꮫ']), + ('Ꮬ', &['ꮬ']), + ('Ꮭ', &['ꮭ']), + ('Ꮮ', &['ꮮ']), + ('Ꮯ', &['ꮯ']), + ('Ꮰ', &['ꮰ']), + ('Ꮱ', &['ꮱ']), + ('Ꮲ', &['ꮲ']), + ('Ꮳ', &['ꮳ']), + ('Ꮴ', &['ꮴ']), + ('Ꮵ', &['ꮵ']), + ('Ꮶ', &['ꮶ']), + ('Ꮷ', &['ꮷ']), + ('Ꮸ', &['ꮸ']), + ('Ꮹ', &['ꮹ']), + ('Ꮺ', &['ꮺ']), + ('Ꮻ', &['ꮻ']), + ('Ꮼ', &['ꮼ']), + ('Ꮽ', &['ꮽ']), + ('Ꮾ', &['ꮾ']), + ('Ꮿ', &['ꮿ']), + ('Ᏸ', &['ᏸ']), + ('Ᏹ', &['ᏹ']), + ('Ᏺ', &['ᏺ']), + ('Ᏻ', &['ᏻ']), + ('Ᏼ', &['ᏼ']), + ('Ᏽ', &['ᏽ']), + ('ᏸ', &['Ᏸ']), + ('ᏹ', &['Ᏹ']), + ('ᏺ', &['Ᏺ']), + ('ᏻ', &['Ᏻ']), + ('ᏼ', &['Ᏼ']), + ('ᏽ', &['Ᏽ']), + ('ᲀ', &['В', 'в']), + ('ᲁ', &['Д', 'д']), + ('ᲂ', &['О', 'о']), + ('ᲃ', &['С', 'с']), + ('ᲄ', &['Т', 'т', 'ᲅ']), + ('ᲅ', &['Т', 'т', 'ᲄ']), + ('ᲆ', &['Ъ', 'ъ']), + ('ᲇ', &['Ѣ', 'ѣ']), + ('ᲈ', &['Ꙋ', 'ꙋ']), + ('ᵹ', &['Ᵹ']), + ('ᵽ', &['Ᵽ']), + ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), + ('Ḃ', &['ḃ']), + ('ḃ', &['Ḃ']), + ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), + ('Ḇ', &['ḇ']), + ('ḇ', &['Ḇ']), + ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), + ('Ḋ', &['ḋ']), + ('ḋ', &['Ḋ']), + ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), + ('Ḏ', &['ḏ']), + ('ḏ', &['Ḏ']), + ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), + ('Ḓ', &['ḓ']), + ('ḓ', &['Ḓ']), + ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), + ('Ḗ', &['ḗ']), + ('ḗ', &['Ḗ']), + ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), + ('Ḛ', &['ḛ']), + ('ḛ', &['Ḛ']), + ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), + ('Ḟ', &['ḟ']), + ('ḟ', &['Ḟ']), + ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), + ('Ḣ', &['ḣ']), + ('ḣ', &['Ḣ']), + ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), + ('Ḧ', &['ḧ']), + ('ḧ', &['Ḧ']), + ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), + ('Ḫ', &['ḫ']), + ('ḫ', 
&['Ḫ']), + ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), + ('Ḯ', &['ḯ']), + ('ḯ', &['Ḯ']), + ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), + ('Ḳ', &['ḳ']), + ('ḳ', &['Ḳ']), + ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), + ('Ḷ', &['ḷ']), + ('ḷ', &['Ḷ']), + ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), + ('Ḻ', &['ḻ']), + ('ḻ', &['Ḻ']), + ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), + ('Ḿ', &['ḿ']), + ('ḿ', &['Ḿ']), + ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), + ('Ṃ', &['ṃ']), + ('ṃ', &['Ṃ']), + ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), + ('Ṇ', &['ṇ']), + ('ṇ', &['Ṇ']), + ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), + ('Ṋ', &['ṋ']), + ('ṋ', &['Ṋ']), + ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), + ('Ṏ', &['ṏ']), + ('ṏ', &['Ṏ']), + ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), + ('Ṓ', &['ṓ']), + ('ṓ', &['Ṓ']), + ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), + ('Ṗ', &['ṗ']), + ('ṗ', &['Ṗ']), + ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), + ('Ṛ', &['ṛ']), + ('ṛ', &['Ṛ']), + ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), + ('Ṟ', &['ṟ']), + ('ṟ', &['Ṟ']), + ('Ṡ', &['ṡ', 'ẛ']), + ('ṡ', &['Ṡ', 'ẛ']), + ('Ṣ', &['ṣ']), + ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), + ('ṥ', &['Ṥ']), + ('Ṧ', &['ṧ']), + ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), + ('ṩ', &['Ṩ']), + ('Ṫ', &['ṫ']), + ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), + ('ṭ', &['Ṭ']), + ('Ṯ', &['ṯ']), + ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), + ('ṱ', &['Ṱ']), + ('Ṳ', &['ṳ']), + ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), + ('ṵ', &['Ṵ']), + ('Ṷ', &['ṷ']), + ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), + ('ṹ', &['Ṹ']), + ('Ṻ', &['ṻ']), + ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), + ('ṽ', &['Ṽ']), + ('Ṿ', &['ṿ']), + ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), + ('ẁ', &['Ẁ']), + ('Ẃ', &['ẃ']), + ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), + ('ẅ', &['Ẅ']), + ('Ẇ', &['ẇ']), + ('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), + ('ẉ', &['Ẉ']), + ('Ẋ', &['ẋ']), + ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), + ('ẍ', &['Ẍ']), + ('Ẏ', &['ẏ']), + ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), + ('ẑ', &['Ẑ']), + ('Ẓ', &['ẓ']), + ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), + ('ẕ', &['Ẕ']), + ('ẛ', &['Ṡ', 'ṡ']), + ('ẞ', &['ß']), + ('Ạ', &['ạ']), + ('ạ', &['Ạ']), + ('Ả', &['ả']), + ('ả', &['Ả']), + ('Ấ', &['ấ']), + ('ấ', &['Ấ']), + ('Ầ', &['ầ']), + ('ầ', &['Ầ']), + 
('Ẩ', &['ẩ']), + ('ẩ', &['Ẩ']), + ('Ẫ', &['ẫ']), + ('ẫ', &['Ẫ']), + ('Ậ', &['ậ']), + ('ậ', &['Ậ']), + ('Ắ', &['ắ']), + ('ắ', &['Ắ']), + ('Ằ', &['ằ']), + ('ằ', &['Ằ']), + ('Ẳ', &['ẳ']), + ('ẳ', &['Ẳ']), + ('Ẵ', &['ẵ']), + ('ẵ', &['Ẵ']), + ('Ặ', &['ặ']), + ('ặ', &['Ặ']), + ('Ẹ', &['ẹ']), + ('ẹ', &['Ẹ']), + ('Ẻ', &['ẻ']), + ('ẻ', &['Ẻ']), + ('Ẽ', &['ẽ']), + ('ẽ', &['Ẽ']), + ('Ế', &['ế']), + ('ế', &['Ế']), + ('Ề', &['ề']), + ('ề', &['Ề']), + ('Ể', &['ể']), + ('ể', &['Ể']), + ('Ễ', &['ễ']), + ('ễ', &['Ễ']), + ('Ệ', &['ệ']), + ('ệ', &['Ệ']), + ('Ỉ', &['ỉ']), + ('ỉ', &['Ỉ']), + ('Ị', &['ị']), + ('ị', &['Ị']), + ('Ọ', &['ọ']), + ('ọ', &['Ọ']), + ('Ỏ', &['ỏ']), + ('ỏ', &['Ỏ']), + ('Ố', &['ố']), + ('ố', &['Ố']), + ('Ồ', &['ồ']), + ('ồ', &['Ồ']), + ('Ổ', &['ổ']), + ('ổ', &['Ổ']), + ('Ỗ', &['ỗ']), + ('ỗ', &['Ỗ']), + ('Ộ', &['ộ']), + ('ộ', &['Ộ']), + ('Ớ', &['ớ']), + ('ớ', &['Ớ']), + ('Ờ', &['ờ']), + ('ờ', &['Ờ']), + ('Ở', &['ở']), + ('ở', &['Ở']), + ('Ỡ', &['ỡ']), + ('ỡ', &['Ỡ']), + ('Ợ', &['ợ']), + ('ợ', &['Ợ']), + ('Ụ', &['ụ']), + ('ụ', &['Ụ']), + ('Ủ', &['ủ']), + ('ủ', &['Ủ']), + ('Ứ', &['ứ']), + ('ứ', &['Ứ']), + ('Ừ', &['ừ']), + ('ừ', &['Ừ']), + ('Ử', &['ử']), + ('ử', &['Ử']), + ('Ữ', &['ữ']), + ('ữ', &['Ữ']), + ('Ự', &['ự']), + ('ự', &['Ự']), + ('Ỳ', &['ỳ']), + ('ỳ', &['Ỳ']), + ('Ỵ', &['ỵ']), + ('ỵ', &['Ỵ']), + ('Ỷ', &['ỷ']), + ('ỷ', &['Ỷ']), + ('Ỹ', &['ỹ']), + ('ỹ', &['Ỹ']), + ('Ỻ', &['ỻ']), + ('ỻ', &['Ỻ']), + ('Ỽ', &['ỽ']), + ('ỽ', &['Ỽ']), + ('Ỿ', &['ỿ']), + ('ỿ', &['Ỿ']), + ('ἀ', &['Ἀ']), + ('ἁ', &['Ἁ']), + ('ἂ', &['Ἂ']), + ('ἃ', &['Ἃ']), + ('ἄ', &['Ἄ']), + ('ἅ', &['Ἅ']), + ('ἆ', &['Ἆ']), + ('ἇ', &['Ἇ']), + ('Ἀ', &['ἀ']), + ('Ἁ', &['ἁ']), + ('Ἂ', &['ἂ']), + ('Ἃ', &['ἃ']), + ('Ἄ', &['ἄ']), + ('Ἅ', &['ἅ']), + ('Ἆ', &['ἆ']), + ('Ἇ', &['ἇ']), + ('ἐ', &['Ἐ']), + ('ἑ', &['Ἑ']), + ('ἒ', &['Ἒ']), + ('ἓ', &['Ἓ']), + ('ἔ', &['Ἔ']), + ('ἕ', &['Ἕ']), + ('Ἐ', &['ἐ']), + ('Ἑ', &['ἑ']), + ('Ἒ', &['ἒ']), + ('Ἓ', &['ἓ']), + ('Ἔ', &['ἔ']), + ('Ἕ', &['ἕ']), + ('ἠ', &['Ἠ']), + ('ἡ', 
&['Ἡ']), + ('ἢ', &['Ἢ']), + ('ἣ', &['Ἣ']), + ('ἤ', &['Ἤ']), + ('ἥ', &['Ἥ']), + ('ἦ', &['Ἦ']), + ('ἧ', &['Ἧ']), + ('Ἠ', &['ἠ']), + ('Ἡ', &['ἡ']), + ('Ἢ', &['ἢ']), + ('Ἣ', &['ἣ']), + ('Ἤ', &['ἤ']), + ('Ἥ', &['ἥ']), + ('Ἦ', &['ἦ']), + ('Ἧ', &['ἧ']), + ('ἰ', &['Ἰ']), + ('ἱ', &['Ἱ']), + ('ἲ', &['Ἲ']), + ('ἳ', &['Ἳ']), + ('ἴ', &['Ἴ']), + ('ἵ', &['Ἵ']), + ('ἶ', &['Ἶ']), + ('ἷ', &['Ἷ']), + ('Ἰ', &['ἰ']), + ('Ἱ', &['ἱ']), + ('Ἲ', &['ἲ']), + ('Ἳ', &['ἳ']), + ('Ἴ', &['ἴ']), + ('Ἵ', &['ἵ']), + ('Ἶ', &['ἶ']), + ('Ἷ', &['ἷ']), + ('ὀ', &['Ὀ']), + ('ὁ', &['Ὁ']), + ('ὂ', &['Ὂ']), + ('ὃ', &['Ὃ']), + ('ὄ', &['Ὄ']), + ('ὅ', &['Ὅ']), + ('Ὀ', &['ὀ']), + ('Ὁ', &['ὁ']), + ('Ὂ', &['ὂ']), + ('Ὃ', &['ὃ']), + ('Ὄ', &['ὄ']), + ('Ὅ', &['ὅ']), + ('ὑ', &['Ὑ']), + ('ὓ', &['Ὓ']), + ('ὕ', &['Ὕ']), + ('ὗ', &['Ὗ']), + ('Ὑ', &['ὑ']), + ('Ὓ', &['ὓ']), + ('Ὕ', &['ὕ']), + ('Ὗ', &['ὗ']), + ('ὠ', &['Ὠ']), + ('ὡ', &['Ὡ']), + ('ὢ', &['Ὢ']), + ('ὣ', &['Ὣ']), + ('ὤ', &['Ὤ']), + ('ὥ', &['Ὥ']), + ('ὦ', &['Ὦ']), + ('ὧ', &['Ὧ']), + ('Ὠ', &['ὠ']), + ('Ὡ', &['ὡ']), + ('Ὢ', &['ὢ']), + ('Ὣ', &['ὣ']), + ('Ὤ', &['ὤ']), + ('Ὥ', &['ὥ']), + ('Ὦ', &['ὦ']), + ('Ὧ', &['ὧ']), + ('ὰ', &['Ὰ']), + ('ά', &['Ά']), + ('ὲ', &['Ὲ']), + ('έ', &['Έ']), + ('ὴ', &['Ὴ']), + ('ή', &['Ή']), + ('ὶ', &['Ὶ']), + ('ί', &['Ί']), + ('ὸ', &['Ὸ']), + ('ό', &['Ό']), + ('ὺ', &['Ὺ']), + ('ύ', &['Ύ']), + ('ὼ', &['Ὼ']), + ('ώ', &['Ώ']), + ('ᾀ', &['ᾈ']), + ('ᾁ', &['ᾉ']), + ('ᾂ', &['ᾊ']), + ('ᾃ', &['ᾋ']), + ('ᾄ', &['ᾌ']), + ('ᾅ', &['ᾍ']), + ('ᾆ', &['ᾎ']), + ('ᾇ', &['ᾏ']), + ('ᾈ', &['ᾀ']), + ('ᾉ', &['ᾁ']), + ('ᾊ', &['ᾂ']), + ('ᾋ', &['ᾃ']), + ('ᾌ', &['ᾄ']), + ('ᾍ', &['ᾅ']), + ('ᾎ', &['ᾆ']), + ('ᾏ', &['ᾇ']), + ('ᾐ', &['ᾘ']), + ('ᾑ', &['ᾙ']), + ('ᾒ', &['ᾚ']), + ('ᾓ', &['ᾛ']), + ('ᾔ', &['ᾜ']), + ('ᾕ', &['ᾝ']), + ('ᾖ', &['ᾞ']), + ('ᾗ', &['ᾟ']), + ('ᾘ', &['ᾐ']), + ('ᾙ', &['ᾑ']), + ('ᾚ', &['ᾒ']), + ('ᾛ', &['ᾓ']), + ('ᾜ', &['ᾔ']), + ('ᾝ', &['ᾕ']), + ('ᾞ', &['ᾖ']), + ('ᾟ', &['ᾗ']), + ('ᾠ', &['ᾨ']), + ('ᾡ', &['ᾩ']), + ('ᾢ', &['ᾪ']), + ('ᾣ', &['ᾫ']), + ('ᾤ', &['ᾬ']), + 
('ᾥ', &['ᾭ']), + ('ᾦ', &['ᾮ']), + ('ᾧ', &['ᾯ']), + ('ᾨ', &['ᾠ']), + ('ᾩ', &['ᾡ']), + ('ᾪ', &['ᾢ']), + ('ᾫ', &['ᾣ']), + ('ᾬ', &['ᾤ']), + ('ᾭ', &['ᾥ']), + ('ᾮ', &['ᾦ']), + ('ᾯ', &['ᾧ']), + ('ᾰ', &['Ᾰ']), + ('ᾱ', &['Ᾱ']), + ('ᾳ', &['ᾼ']), + ('Ᾰ', &['ᾰ']), + ('Ᾱ', &['ᾱ']), + ('Ὰ', &['ὰ']), + ('Ά', &['ά']), + ('ᾼ', &['ᾳ']), + ('ι', &['ͅ', 'Ι', 'ι']), + ('ῃ', &['ῌ']), + ('Ὲ', &['ὲ']), + ('Έ', &['έ']), + ('Ὴ', &['ὴ']), + ('Ή', &['ή']), + ('ῌ', &['ῃ']), + ('ῐ', &['Ῐ']), + ('ῑ', &['Ῑ']), + ('Ῐ', &['ῐ']), + ('Ῑ', &['ῑ']), + ('Ὶ', &['ὶ']), + ('Ί', &['ί']), + ('ῠ', &['Ῠ']), + ('ῡ', &['Ῡ']), + ('ῥ', &['Ῥ']), + ('Ῠ', &['ῠ']), + ('Ῡ', &['ῡ']), + ('Ὺ', &['ὺ']), + ('Ύ', &['ύ']), + ('Ῥ', &['ῥ']), + ('ῳ', &['ῼ']), + ('Ὸ', &['ὸ']), + ('Ό', &['ό']), + ('Ὼ', &['ὼ']), + ('Ώ', &['ώ']), + ('ῼ', &['ῳ']), + ('Ω', &['Ω', 'ω']), + ('K', &['K', 'k']), + ('Å', &['Å', 'å']), + ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), + ('Ⅰ', &['ⅰ']), + ('Ⅱ', &['ⅱ']), + ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), + ('Ⅴ', &['ⅴ']), + ('Ⅵ', &['ⅵ']), + ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), + ('Ⅸ', &['ⅸ']), + ('Ⅹ', &['ⅹ']), + ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), + ('Ⅼ', &['ⅼ']), + ('Ⅽ', &['ⅽ']), + ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), + ('ⅰ', &['Ⅰ']), + ('ⅱ', &['Ⅱ']), + ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), + ('ⅴ', &['Ⅴ']), + ('ⅵ', &['Ⅵ']), + ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), + ('ⅸ', &['Ⅸ']), + ('ⅹ', &['Ⅹ']), + ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), + ('ⅼ', &['Ⅼ']), + ('ⅽ', &['Ⅽ']), + ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), + ('Ↄ', &['ↄ']), + ('ↄ', &['Ↄ']), + ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), + ('Ⓒ', &['ⓒ']), + ('Ⓓ', &['ⓓ']), + ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), + ('Ⓖ', &['ⓖ']), + ('Ⓗ', &['ⓗ']), + ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), + ('Ⓚ', &['ⓚ']), + ('Ⓛ', &['ⓛ']), + ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), + ('Ⓞ', &['ⓞ']), + ('Ⓟ', &['ⓟ']), + ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), + ('Ⓢ', &['ⓢ']), + ('Ⓣ', &['ⓣ']), + ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), + ('Ⓦ', &['ⓦ']), + ('Ⓧ', &['ⓧ']), + ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), + ('ⓐ', &['Ⓐ']), + ('ⓑ', &['Ⓑ']), + ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), + ('ⓔ', &['Ⓔ']), + 
('ⓕ', &['Ⓕ']), + ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), + ('ⓘ', &['Ⓘ']), + ('ⓙ', &['Ⓙ']), + ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), + ('ⓜ', &['Ⓜ']), + ('ⓝ', &['Ⓝ']), + ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), + ('ⓠ', &['Ⓠ']), + ('ⓡ', &['Ⓡ']), + ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), + ('ⓤ', &['Ⓤ']), + ('ⓥ', &['Ⓥ']), + ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), + ('ⓨ', &['Ⓨ']), + ('ⓩ', &['Ⓩ']), + ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), + ('Ⰲ', &['ⰲ']), + ('Ⰳ', &['ⰳ']), + ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), + ('Ⰶ', &['ⰶ']), + ('Ⰷ', &['ⰷ']), + ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), + ('Ⰺ', &['ⰺ']), + ('Ⰻ', &['ⰻ']), + ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), + ('Ⰾ', &['ⰾ']), + ('Ⰿ', &['ⰿ']), + ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), + ('Ⱂ', &['ⱂ']), + ('Ⱃ', &['ⱃ']), + ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), + ('Ⱆ', &['ⱆ']), + ('Ⱇ', &['ⱇ']), + ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), + ('Ⱊ', &['ⱊ']), + ('Ⱋ', &['ⱋ']), + ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), + ('Ⱎ', &['ⱎ']), + ('Ⱏ', &['ⱏ']), + ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), + ('Ⱒ', &['ⱒ']), + ('Ⱓ', &['ⱓ']), + ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), + ('Ⱖ', &['ⱖ']), + ('Ⱗ', &['ⱗ']), + ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), + ('Ⱚ', &['ⱚ']), + ('Ⱛ', &['ⱛ']), + ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), + ('Ⱞ', &['ⱞ']), + ('ⰰ', &['Ⰰ']), + ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), + ('ⰳ', &['Ⰳ']), + ('ⰴ', &['Ⰴ']), + ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), + ('ⰷ', &['Ⰷ']), + ('ⰸ', &['Ⰸ']), + ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), + ('ⰻ', &['Ⰻ']), + ('ⰼ', &['Ⰼ']), + ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), + ('ⰿ', &['Ⰿ']), + ('ⱀ', &['Ⱀ']), + ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), + ('ⱃ', &['Ⱃ']), + ('ⱄ', &['Ⱄ']), + ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), + ('ⱇ', &['Ⱇ']), + ('ⱈ', &['Ⱈ']), + ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), + ('ⱋ', &['Ⱋ']), + ('ⱌ', &['Ⱌ']), + ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), + ('ⱏ', &['Ⱏ']), + ('ⱐ', &['Ⱐ']), + ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), + ('ⱓ', &['Ⱓ']), + ('ⱔ', &['Ⱔ']), + ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), + ('ⱗ', &['Ⱗ']), + ('ⱘ', &['Ⱘ']), + ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), + ('ⱛ', &['Ⱛ']), + ('ⱜ', &['Ⱜ']), + ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), + ('Ⱡ', &['ⱡ']), + ('ⱡ', &['Ⱡ']), + ('Ɫ', 
&['ɫ']), + ('Ᵽ', &['ᵽ']), + ('Ɽ', &['ɽ']), + ('ⱥ', &['Ⱥ']), + ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), + ('ⱨ', &['Ⱨ']), + ('Ⱪ', &['ⱪ']), + ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), + ('ⱬ', &['Ⱬ']), + ('Ɑ', &['ɑ']), + ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), + ('Ɒ', &['ɒ']), + ('Ⱳ', &['ⱳ']), + ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), + ('ⱶ', &['Ⱶ']), + ('Ȿ', &['ȿ']), + ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), + ('ⲁ', &['Ⲁ']), + ('Ⲃ', &['ⲃ']), + ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), + ('ⲅ', &['Ⲅ']), + ('Ⲇ', &['ⲇ']), + ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), + ('ⲉ', &['Ⲉ']), + ('Ⲋ', &['ⲋ']), + ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), + ('ⲍ', &['Ⲍ']), + ('Ⲏ', &['ⲏ']), + ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), + ('ⲑ', &['Ⲑ']), + ('Ⲓ', &['ⲓ']), + ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), + ('ⲕ', &['Ⲕ']), + ('Ⲗ', &['ⲗ']), + ('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), + ('ⲙ', &['Ⲙ']), + ('Ⲛ', &['ⲛ']), + ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), + ('ⲝ', &['Ⲝ']), + ('Ⲟ', &['ⲟ']), + ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), + ('ⲡ', &['Ⲡ']), + ('Ⲣ', &['ⲣ']), + ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), + ('ⲥ', &['Ⲥ']), + ('Ⲧ', &['ⲧ']), + ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), + ('ⲩ', &['Ⲩ']), + ('Ⲫ', &['ⲫ']), + ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), + ('ⲭ', &['Ⲭ']), + ('Ⲯ', &['ⲯ']), + ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), + ('ⲱ', &['Ⲱ']), + ('Ⲳ', &['ⲳ']), + ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), + ('ⲵ', &['Ⲵ']), + ('Ⲷ', &['ⲷ']), + ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), + ('ⲹ', &['Ⲹ']), + ('Ⲻ', &['ⲻ']), + ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), + ('ⲽ', &['Ⲽ']), + ('Ⲿ', &['ⲿ']), + ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), + ('ⳁ', &['Ⳁ']), + ('Ⳃ', &['ⳃ']), + ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), + ('ⳅ', &['Ⳅ']), + ('Ⳇ', &['ⳇ']), + ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), + ('ⳉ', &['Ⳉ']), + ('Ⳋ', &['ⳋ']), + ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), + ('ⳍ', &['Ⳍ']), + ('Ⳏ', &['ⳏ']), + ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), + ('ⳑ', &['Ⳑ']), + ('Ⳓ', &['ⳓ']), + ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), + ('ⳕ', &['Ⳕ']), + ('Ⳗ', &['ⳗ']), + ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), + ('ⳙ', &['Ⳙ']), + ('Ⳛ', &['ⳛ']), + ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), + ('ⳝ', &['Ⳝ']), + ('Ⳟ', &['ⳟ']), + ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), + 
('ⳡ', &['Ⳡ']), + ('Ⳣ', &['ⳣ']), + ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), + ('ⳬ', &['Ⳬ']), + ('Ⳮ', &['ⳮ']), + ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), + ('ⳳ', &['Ⳳ']), + ('ⴀ', &['Ⴀ']), + ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), + ('ⴃ', &['Ⴃ']), + ('ⴄ', &['Ⴄ']), + ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), + ('ⴇ', &['Ⴇ']), + ('ⴈ', &['Ⴈ']), + ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), + ('ⴋ', &['Ⴋ']), + ('ⴌ', &['Ⴌ']), + ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), + ('ⴏ', &['Ⴏ']), + ('ⴐ', &['Ⴐ']), + ('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), + ('ⴓ', &['Ⴓ']), + ('ⴔ', &['Ⴔ']), + ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), + ('ⴗ', &['Ⴗ']), + ('ⴘ', &['Ⴘ']), + ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), + ('ⴛ', &['Ⴛ']), + ('ⴜ', &['Ⴜ']), + ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), + ('ⴟ', &['Ⴟ']), + ('ⴠ', &['Ⴠ']), + ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), + ('ⴣ', &['Ⴣ']), + ('ⴤ', &['Ⴤ']), + ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), + ('ⴭ', &['Ⴭ']), + ('Ꙁ', &['ꙁ']), + ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), + ('ꙃ', &['Ꙃ']), + ('Ꙅ', &['ꙅ']), + ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), + ('ꙇ', &['Ꙇ']), + ('Ꙉ', &['ꙉ']), + ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ']), + ('ꙋ', &['ᲈ', 'Ꙋ']), + ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), + ('Ꙏ', &['ꙏ']), + ('ꙏ', &['Ꙏ']), + ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), + ('Ꙓ', &['ꙓ']), + ('ꙓ', &['Ꙓ']), + ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), + ('Ꙗ', &['ꙗ']), + ('ꙗ', &['Ꙗ']), + ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), + ('Ꙛ', &['ꙛ']), + ('ꙛ', &['Ꙛ']), + ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), + ('Ꙟ', &['ꙟ']), + ('ꙟ', &['Ꙟ']), + ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), + ('Ꙣ', &['ꙣ']), + ('ꙣ', &['Ꙣ']), + ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), + ('Ꙧ', &['ꙧ']), + ('ꙧ', &['Ꙧ']), + ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), + ('Ꙫ', &['ꙫ']), + ('ꙫ', &['Ꙫ']), + ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), + ('Ꚁ', &['ꚁ']), + ('ꚁ', &['Ꚁ']), + ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), + ('Ꚅ', &['ꚅ']), + ('ꚅ', &['Ꚅ']), + ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), + ('Ꚉ', &['ꚉ']), + ('ꚉ', &['Ꚉ']), + ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), + ('Ꚍ', &['ꚍ']), + ('ꚍ', &['Ꚍ']), + ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), + ('Ꚑ', &['ꚑ']), + ('ꚑ', &['Ꚑ']), + ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), + ('Ꚕ', &['ꚕ']), + ('ꚕ', &['Ꚕ']), + 
('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), + ('Ꚙ', &['ꚙ']), + ('ꚙ', &['Ꚙ']), + ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), + ('Ꜣ', &['ꜣ']), + ('ꜣ', &['Ꜣ']), + ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), + ('Ꜧ', &['ꜧ']), + ('ꜧ', &['Ꜧ']), + ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), + ('Ꜫ', &['ꜫ']), + ('ꜫ', &['Ꜫ']), + ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), + ('Ꜯ', &['ꜯ']), + ('ꜯ', &['Ꜯ']), + ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), + ('Ꜵ', &['ꜵ']), + ('ꜵ', &['Ꜵ']), + ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), + ('Ꜹ', &['ꜹ']), + ('ꜹ', &['Ꜹ']), + ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), + ('Ꜽ', &['ꜽ']), + ('ꜽ', &['Ꜽ']), + ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), + ('Ꝁ', &['ꝁ']), + ('ꝁ', &['Ꝁ']), + ('Ꝃ', &['ꝃ']), + ('ꝃ', &['Ꝃ']), + ('Ꝅ', &['ꝅ']), + ('ꝅ', &['Ꝅ']), + ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), + ('Ꝉ', &['ꝉ']), + ('ꝉ', &['Ꝉ']), + ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), + ('Ꝍ', &['ꝍ']), + ('ꝍ', &['Ꝍ']), + ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), + ('Ꝑ', &['ꝑ']), + ('ꝑ', &['Ꝑ']), + ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), + ('Ꝕ', &['ꝕ']), + ('ꝕ', &['Ꝕ']), + ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), + ('Ꝙ', &['ꝙ']), + ('ꝙ', &['Ꝙ']), + ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), + ('Ꝝ', &['ꝝ']), + ('ꝝ', &['Ꝝ']), + ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), + ('Ꝡ', &['ꝡ']), + ('ꝡ', &['Ꝡ']), + ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), + ('Ꝥ', &['ꝥ']), + ('ꝥ', &['Ꝥ']), + ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), + ('Ꝩ', &['ꝩ']), + ('ꝩ', &['Ꝩ']), + ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), + ('Ꝭ', &['ꝭ']), + ('ꝭ', &['Ꝭ']), + ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), + ('Ꝺ', &['ꝺ']), + ('ꝺ', &['Ꝺ']), + ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), + ('Ᵹ', &['ᵹ']), + ('Ꝿ', &['ꝿ']), + ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), + ('ꞁ', &['Ꞁ']), + ('Ꞃ', &['ꞃ']), + ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), + ('ꞅ', &['Ꞅ']), + ('Ꞇ', &['ꞇ']), + ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), + ('ꞌ', &['Ꞌ']), + ('Ɥ', &['ɥ']), + ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), + ('Ꞓ', &['ꞓ']), + ('ꞓ', &['Ꞓ']), + ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), + ('Ꞙ', &['ꞙ']), + ('ꞙ', &['Ꞙ']), + ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), + ('Ꞝ', &['ꞝ']), + ('ꞝ', &['Ꞝ']), + ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), + ('Ꞡ', &['ꞡ']), + ('ꞡ', &['Ꞡ']), + ('Ꞣ', &['ꞣ']), + ('ꞣ', 
&['Ꞣ']), + ('Ꞥ', &['ꞥ']), + ('ꞥ', &['Ꞥ']), + ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), + ('Ꞩ', &['ꞩ']), + ('ꞩ', &['Ꞩ']), + ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), + ('Ɡ', &['ɡ']), + ('Ɬ', &['ɬ']), + ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), + ('Ʇ', &['ʇ']), + ('Ʝ', &['ʝ']), + ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), + ('ꞵ', &['Ꞵ']), + ('Ꞷ', &['ꞷ']), + ('ꞷ', &['Ꞷ']), + ('ꭓ', &['Ꭓ']), + ('ꭰ', &['Ꭰ']), + ('ꭱ', &['Ꭱ']), + ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), + ('ꭴ', &['Ꭴ']), + ('ꭵ', &['Ꭵ']), + ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), + ('ꭸ', &['Ꭸ']), + ('ꭹ', &['Ꭹ']), + ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), + ('ꭼ', &['Ꭼ']), + ('ꭽ', &['Ꭽ']), + ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), + ('ꮀ', &['Ꮀ']), + ('ꮁ', &['Ꮁ']), + ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), + ('ꮄ', &['Ꮄ']), + ('ꮅ', &['Ꮅ']), + ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), + ('ꮈ', &['Ꮈ']), + ('ꮉ', &['Ꮉ']), + ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), + ('ꮌ', &['Ꮌ']), + ('ꮍ', &['Ꮍ']), + ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), + ('ꮐ', &['Ꮐ']), + ('ꮑ', &['Ꮑ']), + ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), + ('ꮔ', &['Ꮔ']), + ('ꮕ', &['Ꮕ']), + ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), + ('ꮘ', &['Ꮘ']), + ('ꮙ', &['Ꮙ']), + ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), + ('ꮜ', &['Ꮜ']), + ('ꮝ', &['Ꮝ']), + ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), + ('ꮠ', &['Ꮠ']), + ('ꮡ', &['Ꮡ']), + ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), + ('ꮤ', &['Ꮤ']), + ('ꮥ', &['Ꮥ']), + ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), + ('ꮨ', &['Ꮨ']), + ('ꮩ', &['Ꮩ']), + ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), + ('ꮬ', &['Ꮬ']), + ('ꮭ', &['Ꮭ']), + ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), + ('ꮰ', &['Ꮰ']), + ('ꮱ', &['Ꮱ']), + ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), + ('ꮴ', &['Ꮴ']), + ('ꮵ', &['Ꮵ']), + ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), + ('ꮸ', &['Ꮸ']), + ('ꮹ', &['Ꮹ']), + ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), + ('ꮼ', &['Ꮼ']), + ('ꮽ', &['Ꮽ']), + ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + 
('R', &['r']), + ('S', &['s']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('𐐀', &['𐐨']), + ('𐐁', &['𐐩']), + ('𐐂', &['𐐪']), + ('𐐃', &['𐐫']), + ('𐐄', &['𐐬']), + ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), + ('𐐇', &['𐐯']), + ('𐐈', &['𐐰']), + ('𐐉', &['𐐱']), + ('𐐊', &['𐐲']), + ('𐐋', &['𐐳']), + ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), + ('𐐎', &['𐐶']), + ('𐐏', &['𐐷']), + ('𐐐', &['𐐸']), + ('𐐑', &['𐐹']), + ('𐐒', &['𐐺']), + ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), + ('𐐕', &['𐐽']), + ('𐐖', &['𐐾']), + ('𐐗', &['𐐿']), + ('𐐘', &['𐑀']), + ('𐐙', &['𐑁']), + ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), + ('𐐜', &['𐑄']), + ('𐐝', &['𐑅']), + ('𐐞', &['𐑆']), + ('𐐟', &['𐑇']), + ('𐐠', &['𐑈']), + ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), + ('𐐣', &['𐑋']), + ('𐐤', &['𐑌']), + ('𐐥', &['𐑍']), + ('𐐦', &['𐑎']), + ('𐐧', &['𐑏']), + ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), + ('𐐪', &['𐐂']), + ('𐐫', &['𐐃']), + ('𐐬', &['𐐄']), + ('𐐭', &['𐐅']), + ('𐐮', &['𐐆']), + ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), + ('𐐱', &['𐐉']), + ('𐐲', &['𐐊']), + ('𐐳', &['𐐋']), + ('𐐴', &['𐐌']), + ('𐐵', &['𐐍']), + ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), + ('𐐸', &['𐐐']), + ('𐐹', &['𐐑']), + ('𐐺', &['𐐒']), + ('𐐻', &['𐐓']), + ('𐐼', &['𐐔']), + ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), + ('𐐿', &['𐐗']), + ('𐑀', &['𐐘']), + ('𐑁', &['𐐙']), + ('𐑂', &['𐐚']), + ('𐑃', &['𐐛']), + ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), + ('𐑆', &['𐐞']), + ('𐑇', &['𐐟']), + ('𐑈', &['𐐠']), + ('𐑉', &['𐐡']), + ('𐑊', &['𐐢']), + ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), + ('𐑍', &['𐐥']), + ('𐑎', &['𐐦']), + ('𐑏', &['𐐧']), + ('𐒰', &['𐓘']), + ('𐒱', &['𐓙']), + ('𐒲', 
&['𐓚']), + ('𐒳', &['𐓛']), + ('𐒴', &['𐓜']), + ('𐒵', &['𐓝']), + ('𐒶', &['𐓞']), + ('𐒷', &['𐓟']), + ('𐒸', &['𐓠']), + ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), + ('𐒻', &['𐓣']), + ('𐒼', &['𐓤']), + ('𐒽', &['𐓥']), + ('𐒾', &['𐓦']), + ('𐒿', &['𐓧']), + ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), + ('𐓂', &['𐓪']), + ('𐓃', &['𐓫']), + ('𐓄', &['𐓬']), + ('𐓅', &['𐓭']), + ('𐓆', &['𐓮']), + ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), + ('𐓉', &['𐓱']), + ('𐓊', &['𐓲']), + ('𐓋', &['𐓳']), + ('𐓌', &['𐓴']), + ('𐓍', &['𐓵']), + ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), + ('𐓐', &['𐓸']), + ('𐓑', &['𐓹']), + ('𐓒', &['𐓺']), + ('𐓓', &['𐓻']), + ('𐓘', &['𐒰']), + ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), + ('𐓛', &['𐒳']), + ('𐓜', &['𐒴']), + ('𐓝', &['𐒵']), + ('𐓞', &['𐒶']), + ('𐓟', &['𐒷']), + ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), + ('𐓢', &['𐒺']), + ('𐓣', &['𐒻']), + ('𐓤', &['𐒼']), + ('𐓥', &['𐒽']), + ('𐓦', &['𐒾']), + ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), + ('𐓩', &['𐓁']), + ('𐓪', &['𐓂']), + ('𐓫', &['𐓃']), + ('𐓬', &['𐓄']), + ('𐓭', &['𐓅']), + ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), + ('𐓰', &['𐓈']), + ('𐓱', &['𐓉']), + ('𐓲', &['𐓊']), + ('𐓳', &['𐓋']), + ('𐓴', &['𐓌']), + ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), + ('𐓷', &['𐓏']), + ('𐓸', &['𐓐']), + ('𐓹', &['𐓑']), + ('𐓺', &['𐓒']), + ('𐓻', &['𐓓']), + ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), + ('𐲂', &['𐳂']), + ('𐲃', &['𐳃']), + ('𐲄', &['𐳄']), + ('𐲅', &['𐳅']), + ('𐲆', &['𐳆']), + ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), + ('𐲉', &['𐳉']), + ('𐲊', &['𐳊']), + ('𐲋', &['𐳋']), + ('𐲌', &['𐳌']), + ('𐲍', &['𐳍']), + ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), + ('𐲐', &['𐳐']), + ('𐲑', &['𐳑']), + ('𐲒', &['𐳒']), + ('𐲓', &['𐳓']), + ('𐲔', &['𐳔']), + ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), + ('𐲗', &['𐳗']), + ('𐲘', &['𐳘']), + ('𐲙', &['𐳙']), + ('𐲚', &['𐳚']), + ('𐲛', &['𐳛']), + ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), + ('𐲞', &['𐳞']), + ('𐲟', &['𐳟']), + ('𐲠', &['𐳠']), + ('𐲡', &['𐳡']), + ('𐲢', &['𐳢']), + ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), + ('𐲥', &['𐳥']), + ('𐲦', &['𐳦']), + ('𐲧', &['𐳧']), + ('𐲨', &['𐳨']), + ('𐲩', &['𐳩']), + ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), + ('𐲬', &['𐳬']), + ('𐲭', &['𐳭']), + ('𐲮', &['𐳮']), + ('𐲯', &['𐳯']), + 
('𐲰', &['𐳰']), + ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), + ('𐳀', &['𐲀']), + ('𐳁', &['𐲁']), + ('𐳂', &['𐲂']), + ('𐳃', &['𐲃']), + ('𐳄', &['𐲄']), + ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), + ('𐳇', &['𐲇']), + ('𐳈', &['𐲈']), + ('𐳉', &['𐲉']), + ('𐳊', &['𐲊']), + ('𐳋', &['𐲋']), + ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), + ('𐳎', &['𐲎']), + ('𐳏', &['𐲏']), + ('𐳐', &['𐲐']), + ('𐳑', &['𐲑']), + ('𐳒', &['𐲒']), + ('𐳓', &['𐲓']), + ('𐳔', &['𐲔']), + ('𐳕', &['𐲕']), + ('𐳖', &['𐲖']), + ('𐳗', &['𐲗']), + ('𐳘', &['𐲘']), + ('𐳙', &['𐲙']), + ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), + ('𐳜', &['𐲜']), + ('𐳝', &['𐲝']), + ('𐳞', &['𐲞']), + ('𐳟', &['𐲟']), + ('𐳠', &['𐲠']), + ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), + ('𐳣', &['𐲣']), + ('𐳤', &['𐲤']), + ('𐳥', &['𐲥']), + ('𐳦', &['𐲦']), + ('𐳧', &['𐲧']), + ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), + ('𐳪', &['𐲪']), + ('𐳫', &['𐲫']), + ('𐳬', &['𐲬']), + ('𐳭', &['𐲭']), + ('𐳮', &['𐲮']), + ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), + ('𐳱', &['𐲱']), + ('𐳲', &['𐲲']), + ('𑢠', &['𑣀']), + ('𑢡', &['𑣁']), + ('𑢢', &['𑣂']), + ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), + ('𑢥', &['𑣅']), + ('𑢦', &['𑣆']), + ('𑢧', &['𑣇']), + ('𑢨', &['𑣈']), + ('𑢩', &['𑣉']), + ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), + ('𑢬', &['𑣌']), + ('𑢭', &['𑣍']), + ('𑢮', &['𑣎']), + ('𑢯', &['𑣏']), + ('𑢰', &['𑣐']), + ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), + ('𑢳', &['𑣓']), + ('𑢴', &['𑣔']), + ('𑢵', &['𑣕']), + ('𑢶', &['𑣖']), + ('𑢷', &['𑣗']), + ('𑢸', &['𑣘']), + ('𑢹', &['𑣙']), + ('𑢺', &['𑣚']), + ('𑢻', &['𑣛']), + ('𑢼', &['𑣜']), + ('𑢽', &['𑣝']), + ('𑢾', &['𑣞']), + ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), + ('𑣁', &['𑢡']), + ('𑣂', &['𑢢']), + ('𑣃', &['𑢣']), + ('𑣄', &['𑢤']), + ('𑣅', &['𑢥']), + ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), + ('𑣈', &['𑢨']), + ('𑣉', &['𑢩']), + ('𑣊', &['𑢪']), + ('𑣋', &['𑢫']), + ('𑣌', &['𑢬']), + ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), + ('𑣏', &['𑢯']), + ('𑣐', &['𑢰']), + ('𑣑', &['𑢱']), + ('𑣒', &['𑢲']), + ('𑣓', &['𑢳']), + ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), + ('𑣖', &['𑢶']), + ('𑣗', &['𑢷']), + ('𑣘', &['𑢸']), + ('𑣙', &['𑢹']), + ('𑣚', &['𑢺']), + ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), + ('𑣝', &['𑢽']), + ('𑣞', &['𑢾']), + ('𑣟', 
&['𑢿']), + ('𞤀', &['𞤢']), + ('𞤁', &['𞤣']), + ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), + ('𞤄', &['𞤦']), + ('𞤅', &['𞤧']), + ('𞤆', &['𞤨']), + ('𞤇', &['𞤩']), + ('𞤈', &['𞤪']), + ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), + ('𞤋', &['𞤭']), + ('𞤌', &['𞤮']), + ('𞤍', &['𞤯']), + ('𞤎', &['𞤰']), + ('𞤏', &['𞤱']), + ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), + ('𞤒', &['𞤴']), + ('𞤓', &['𞤵']), + ('𞤔', &['𞤶']), + ('𞤕', &['𞤷']), + ('𞤖', &['𞤸']), + ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), + ('𞤙', &['𞤻']), + ('𞤚', &['𞤼']), + ('𞤛', &['𞤽']), + ('𞤜', &['𞤾']), + ('𞤝', &['𞤿']), + ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), + ('𞤠', &['𞥂']), + ('𞤡', &['𞥃']), + ('𞤢', &['𞤀']), + ('𞤣', &['𞤁']), + ('𞤤', &['𞤂']), + ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), + ('𞤧', &['𞤅']), + ('𞤨', &['𞤆']), + ('𞤩', &['𞤇']), + ('𞤪', &['𞤈']), + ('𞤫', &['𞤉']), + ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), + ('𞤮', &['𞤌']), + ('𞤯', &['𞤍']), + ('𞤰', &['𞤎']), + ('𞤱', &['𞤏']), + ('𞤲', &['𞤐']), + ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), + ('𞤵', &['𞤓']), + ('𞤶', &['𞤔']), + ('𞤷', &['𞤕']), + ('𞤸', &['𞤖']), + ('𞤹', &['𞤗']), + ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), + ('𞤼', &['𞤚']), + ('𞤽', &['𞤛']), + ('𞤾', &['𞤜']), + ('𞤿', &['𞤝']), + ('𞥀', &['𞤞']), + ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), + ('𞥃', &['𞤡']), +]; diff --git a/parse_wiki_text/src/character_entity.rs b/parse_wiki_text/src/character_entity.rs new file mode 100644 index 0000000..018efa0 --- /dev/null +++ b/parse_wiki_text/src/character_entity.rs @@ -0,0 +1,22 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +pub fn parse_character_entity(state: &mut crate::State, configuration: &crate::Configuration) { + if let Ok((match_length, character)) = configuration + .character_entities + .find(&state.wiki_text[state.scan_position + 1..]) + { + let start_position = state.scan_position; + state.flush(start_position); + state.flushed_position = match_length + start_position + 1; + state.scan_position = state.flushed_position; + state.nodes.push(crate::Node::CharacterEntity { + character, + end: state.scan_position, + start: start_position, + }); + } else { + state.scan_position += 1; + } +} diff --git a/parse_wiki_text/src/comment.rs b/parse_wiki_text/src/comment.rs new file mode 100644 index 0000000..d933c17 --- /dev/null +++ b/parse_wiki_text/src/comment.rs @@ -0,0 +1,109 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_comment(state: &mut crate::State) { + let start_position = state.scan_position; + let mut position = start_position; + state.flush(position); + position += 4; + while let Some(character) = state.get_byte(position) { + match character { + b'-' if state.get_byte(position + 1) == Some(b'-') + && state.get_byte(position + 2) == Some(b'>') => + { + position += 3; + break; + } + b'<' if state.get_byte(position + 1) == Some(b'/') => { + if parse_end_tag(state, start_position, position) { + return; + } + position += 2; + continue; + } + _ => {} + } + position += 1; + } + state.flushed_position = position; + state.scan_position = position; + state.nodes.push(crate::Node::Comment { + end: state.scan_position, + start: start_position, + }); +} + +fn parse_end_tag( + state: &mut crate::State, + comment_start_position: usize, + tag_start_position: usize, +) -> bool { + let tag_name_start_position = tag_start_position + 2; + let mut tag_name_end_position = tag_name_start_position; + let tag_end_position = loop { + match 
state.get_byte(tag_name_end_position) { + None | Some(b'/') | Some(b'<') => return false, + Some(b'\t') | Some(b'\n') | Some(b' ') => { + let tag_end_position = state.skip_whitespace_forwards(tag_name_end_position + 1); + match state.get_byte(tag_end_position) { + Some(b'>') => break tag_end_position, + _ => return false, + } + } + Some(b'>') => break tag_name_end_position, + _ => tag_name_end_position += 1, + } + } + 1; + let tag_name = &state.wiki_text[tag_name_start_position..tag_name_end_position]; + let tag_name = if tag_name.as_bytes().iter().all(u8::is_ascii_lowercase) { + crate::Cow::Borrowed(tag_name) + } else { + tag_name.to_ascii_lowercase().into() + }; + let mut matched_node_index = None; + for (open_node_index, open_node) in state.stack.iter().enumerate().rev() { + if let crate::OpenNodeType::Tag { name, .. } = &open_node.type_ { + if name == &tag_name { + matched_node_index = Some(open_node_index); + break; + } + } + } + match matched_node_index { + None => false, + Some(open_node_index) => { + if open_node_index < state.stack.len() - 1 { + state.warnings.push(crate::Warning { + end: tag_end_position, + message: crate::WarningMessage::MissingEndTagRewinding, + start: tag_start_position, + }); + state.stack.truncate(open_node_index + 2); + let open_node = state.stack.pop().unwrap(); + state.rewind(open_node.nodes, open_node.start); + } else { + state.warnings.push(crate::Warning { + end: tag_end_position, + message: crate::WarningMessage::EndTagInComment, + start: tag_start_position, + }); + state.nodes.push(crate::Node::Comment { + end: tag_start_position, + start: comment_start_position, + }); + let open_node = state.stack.pop().unwrap(); + state.flushed_position = tag_end_position; + state.scan_position = state.flushed_position; + let nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + state.nodes.push(crate::Node::Tag { + end: state.scan_position, + name: tag_name, + nodes, + start: open_node.start, + }); + } + true + } + } +} diff 
--git a/parse_wiki_text/src/configuration.rs b/parse_wiki_text/src/configuration.rs new file mode 100644 index 0000000..875a69b --- /dev/null +++ b/parse_wiki_text/src/configuration.rs @@ -0,0 +1,164 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +/// Site specific configuration of a wiki. +/// +/// This is generated using the program [`fetch_mediawiki_configuration`](https://github.com/portstrom/fetch_mediawiki_configuration). +pub struct ConfigurationSource<'a> { + /// Aliases of the category namespace. + pub category_namespaces: &'a [&'a str], + + /// Tag names of extension tags. + pub extension_tags: &'a [&'a str], + + /// Aliases of the file namespace. + pub file_namespaces: &'a [&'a str], + + /// Characters that can appear in link trails. + pub link_trail: &'a str, + + /// Magic words that can appear between `__` and `__`. + pub magic_words: &'a [&'a str], + + /// Protocols that can be used for external links. + pub protocols: &'a [&'a str], + + /// Magic words that can be used for redirects. + pub redirect_magic_words: &'a [&'a str], +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum Namespace { + Category, + File, +} + +impl crate::Configuration { + /// Allocates and returns a new configuration based on the given site specific configuration. 
+ #[must_use] + pub fn new(source: &ConfigurationSource) -> Self { + let mut configuration = crate::Configuration { + character_entities: crate::Trie::new(), + link_trail_character_set: crate::HashSet::new(), + magic_words: crate::Trie::new(), + namespaces: crate::Trie::new(), + protocols: crate::Trie::new(), + redirect_magic_words: crate::Trie::new(), + tag_name_map: crate::HashMap::new(), + }; + for (name, character) in crate::html_entities::HTML_ENTITIES { + configuration + .character_entities + .add_case_sensitive_term(&format!("{};", name), *character); + } + for character in source.link_trail.chars() { + configuration.link_trail_character_set.insert(character); + } + for protocol in source.protocols { + configuration.protocols.add_term(protocol, ()); + } + for magic_word in source.magic_words { + configuration.magic_words.add_term(magic_word, ()); + } + for namespace in source.category_namespaces { + configuration + .namespaces + .add_term(&format!("{}:", namespace), Namespace::Category); + } + for namespace in source.file_namespaces { + configuration + .namespaces + .add_term(&format!("{}:", namespace), Namespace::File); + } + for redirect_magic_word in source.redirect_magic_words { + configuration + .redirect_magic_words + .add_term(redirect_magic_word, ()); + } + for tag_name in source.extension_tags { + configuration + .tag_name_map + .insert(tag_name.to_string(), crate::TagClass::ExtensionTag); + } + for tag_name in [ + "abbr", + "b", + "bdi", + "bdo", + "blockquote", + "br", + "caption", + "center", + "cite", + "code", + "data", + "dd", + "del", + "dfn", + "div", + "dl", + "dt", + "em", + "font", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "ins", + "kbd", + "li", + "mark", + "ol", + "p", + "pre", + "q", + "rb", + "rp", + "rt", + "ruby", + "s", + "samp", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "td", + "th", + "time", + "tr", + "tt", + "u", + "ul", + "var", + "wbr", + ] + .iter() + { + configuration 
+ .tag_name_map + .insert(tag_name.to_string(), crate::TagClass::Tag); + } + configuration + } + + /// Parses wiki text into structured data. + #[must_use] + pub fn parse<'a>(&self, wiki_text: &'a str) -> crate::Output<'a> { + crate::parse::parse(self, wiki_text) + } +} + +impl Default for crate::Configuration { + /// Allocates and returns a configuration suitable for testing and quick and dirty prototyping. For correctly parsing an actual wiki, please get the correct site configuration for that particular wiki. + fn default() -> Self { + crate::default::create_configuration() + } +} diff --git a/parse_wiki_text/src/default.rs b/parse_wiki_text/src/default.rs new file mode 100644 index 0000000..50110e4 --- /dev/null +++ b/parse_wiki_text/src/default.rs @@ -0,0 +1,88 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn create_configuration() -> crate::Configuration { + crate::Configuration::new(&crate::ConfigurationSource { + category_namespaces: &["category"], + extension_tags: &[ + "categorytree", + "ce", + "charinsert", + "chem", + "gallery", + "graph", + "hiero", + "imagemap", + "indicator", + "inputbox", + "mapframe", + "maplink", + "math", + "nowiki", + "poem", + "pre", + "ref", + "references", + "score", + "section", + "source", + "syntaxhighlight", + "templatedata", + "timeline", + ], + file_namespaces: &["file", "image"], + link_trail: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + magic_words: &[ + "DISAMBIG", + "FORCETOC", + "HIDDENCAT", + "INDEX", + "NEWSECTIONLINK", + "NOCC", + "NOCOLLABORATIONHUBTOC", + "NOCONTENTCONVERT", + "NOEDITSECTION", + "NOGALLERY", + "NOGLOBAL", + "NOINDEX", + "NONEWSECTIONLINK", + "NOTC", + "NOTITLECONVERT", + "NOTOC", + "STATICREDIRECT", + "TOC", + ], + protocols: &[ + "//", + "bitcoin:", + "ftp://", + "ftps://", + "geo:", + "git://", + "gopher://", + "http://", + "https://", + 
"irc://", + "ircs://", + "magnet:", + "mailto:", + "mms://", + "news:", + "nntp://", + "redis://", + "sftp://", + "sip:", + "sips:", + "sms:", + "ssh://", + "svn://", + "tel:", + "telnet://", + "urn:", + "worldwind://", + "xmpp:", + ], + redirect_magic_words: &["REDIRECT"], + }) +} diff --git a/parse_wiki_text/src/external_link.rs b/parse_wiki_text/src/external_link.rs new file mode 100644 index 0000000..ce30f87 --- /dev/null +++ b/parse_wiki_text/src/external_link.rs @@ -0,0 +1,47 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_external_link_end<'a>( + state: &mut crate::State<'a>, + start_position: usize, + nodes: Vec>, +) { + let scan_position = state.scan_position; + state.flush(scan_position); + state.scan_position += 1; + state.flushed_position = state.scan_position; + let nodes = std::mem::replace(&mut state.nodes, nodes); + state.nodes.push(crate::Node::ExternalLink { + end: state.scan_position, + nodes, + start: start_position, + }); +} + +pub fn parse_external_link_end_of_line(state: &mut crate::State) { + let end = state.scan_position; + let open_node = state.stack.pop().unwrap(); + state.warnings.push(crate::Warning { + end, + message: crate::WarningMessage::InvalidLinkSyntax, + start: open_node.start, + }); + state.rewind(open_node.nodes, open_node.start); +} + +pub fn parse_external_link_start(state: &mut crate::State, configuration: &crate::Configuration) { + let scheme_start_position = state.scan_position + 1; + match configuration + .protocols + .find(&state.wiki_text[scheme_start_position..]) + { + Err(_) => { + state.scan_position = scheme_start_position; + return; + } + Ok(_) => { + state.push_open_node(crate::OpenNodeType::ExternalLink, scheme_start_position); + } + } +} diff --git a/parse_wiki_text/src/heading.rs b/parse_wiki_text/src/heading.rs new file mode 100644 index 0000000..9c4d647 --- 
/dev/null +++ b/parse_wiki_text/src/heading.rs @@ -0,0 +1,88 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_heading_end(state: &mut crate::State) { + let mut end_position = state.scan_position; + loop { + match state.get_byte(end_position - 1) { + Some(b'\t') | Some(b' ') => end_position -= 1, + _ => break, + } + } + let open_node = state.stack.pop().unwrap(); + if state.get_byte(end_position - 1) != Some(b'=') || end_position < open_node.start + 3 { + state.warnings.push(crate::Warning { + end: end_position, + message: crate::WarningMessage::InvalidHeadingSyntaxRewinding, + start: open_node.start, + }); + state.rewind(open_node.nodes, open_node.start); + return; + } + let start_level = match open_node.type_ { + crate::OpenNodeType::Heading { level } => level, + _ => unreachable!(), + }; + let mut end_level: u8 = 1; + while end_level < start_level + && end_position - end_level as usize > open_node.start + end_level as usize + 2 + && state.get_byte(end_position - end_level as usize - 1) == Some(b'=') + { + end_level += 1; + } + let position = state.skip_whitespace_backwards(end_position - end_level as usize); + if end_level < start_level { + state.warnings.push(crate::Warning { + end: end_position, + message: crate::WarningMessage::UnexpectedHeadingLevelCorrecting, + start: open_node.start, + }); + let inner_start_position = open_node.start + end_level as usize; + if match state.nodes.get_mut(0) { + None => { + state.flushed_position = inner_start_position; + false + } + Some(crate::Node::Text { end, start, value }) => { + *start = inner_start_position; + *value = &state.wiki_text[inner_start_position..*end]; + false + } + Some(_) => true, + } { + let end = state.skip_whitespace_forwards(open_node.start + start_level as usize); + state.nodes.insert( + 0, + crate::Node::Text { + end, + start: inner_start_position, + value: 
&state.wiki_text[inner_start_position..end], + }, + ); + } + } + state.flush(position); + let nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + state.nodes.push(crate::Node::Heading { + end: end_position, + level: end_level, + nodes, + start: open_node.start, + }); + state.scan_position += 1; + state.skip_empty_lines(); +} + +pub fn parse_heading_start(state: &mut crate::State) { + let mut level = 1; + while state.get_byte(state.scan_position + level) == Some(b'=') && level < 6 { + level += 1; + } + let position = state.skip_whitespace_forwards(state.scan_position + level); + state.flushed_position = position; + state.push_open_node( + crate::OpenNodeType::Heading { level: level as u8 }, + position, + ); +} diff --git a/parse_wiki_text/src/html_entities.rs b/parse_wiki_text/src/html_entities.rs new file mode 100644 index 0000000..6625eb8 --- /dev/null +++ b/parse_wiki_text/src/html_entities.rs @@ -0,0 +1,259 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +pub const HTML_ENTITIES: &[(&str, char)] = &[ + ("AElig", '\u{c6}'), + ("Aacute", '\u{c1}'), + ("Acirc", '\u{c2}'), + ("Agrave", '\u{c0}'), + ("Alpha", '\u{391}'), + ("Aring", '\u{c5}'), + ("Atilde", '\u{c3}'), + ("Auml", '\u{c4}'), + ("Beta", '\u{392}'), + ("Ccedil", '\u{c7}'), + ("Chi", '\u{3a7}'), + ("Dagger", '\u{2021}'), + ("Delta", '\u{394}'), + ("ETH", '\u{d0}'), + ("Eacute", '\u{c9}'), + ("Ecirc", '\u{ca}'), + ("Egrave", '\u{c8}'), + ("Epsilon", '\u{395}'), + ("Eta", '\u{397}'), + ("Euml", '\u{cb}'), + ("Gamma", '\u{393}'), + ("Iacute", '\u{cd}'), + ("Icirc", '\u{ce}'), + ("Igrave", '\u{cc}'), + ("Iota", '\u{399}'), + ("Iuml", '\u{cf}'), + ("Kappa", '\u{39a}'), + ("Lambda", '\u{39b}'), + ("Mu", '\u{39c}'), + ("Ntilde", '\u{d1}'), + ("Nu", '\u{39d}'), + ("OElig", '\u{152}'), + ("Oacute", '\u{d3}'), + ("Ocirc", '\u{d4}'), + ("Ograve", '\u{d2}'), + ("Omega", '\u{3a9}'), + ("Omicron", '\u{39f}'), + ("Oslash", '\u{d8}'), + ("Otilde", '\u{d5}'), + ("Ouml", '\u{d6}'), + ("Phi", '\u{3a6}'), + ("Pi", '\u{3a0}'), + ("Prime", '\u{2033}'), + ("Psi", '\u{3a8}'), + ("Rho", '\u{3a1}'), + ("Scaron", '\u{160}'), + ("Sigma", '\u{3a3}'), + ("THORN", '\u{de}'), + ("Tau", '\u{3a4}'), + ("Theta", '\u{398}'), + ("Uacute", '\u{da}'), + ("Ucirc", '\u{db}'), + ("Ugrave", '\u{d9}'), + ("Upsilon", '\u{3a5}'), + ("Uuml", '\u{dc}'), + ("Xi", '\u{39e}'), + ("Yacute", '\u{dd}'), + ("Yuml", '\u{178}'), + ("Zeta", '\u{396}'), + ("aacute", '\u{e1}'), + ("acirc", '\u{e2}'), + ("acute", '\u{b4}'), + ("aelig", '\u{e6}'), + ("agrave", '\u{e0}'), + ("alefsym", '\u{2135}'), + ("alpha", '\u{3b1}'), + ("amp", '\u{26}'), + ("and", '\u{2227}'), + ("ang", '\u{2220}'), + ("apos", '\u{27}'), + ("aring", '\u{e5}'), + ("asymp", '\u{2248}'), + ("atilde", '\u{e3}'), + ("auml", '\u{e4}'), + ("bdquo", '\u{201e}'), + ("beta", '\u{3b2}'), + ("brvbar", '\u{a6}'), + ("bull", '\u{2022}'), + ("cap", '\u{2229}'), + ("ccedil", '\u{e7}'), + ("cedil", '\u{b8}'), + ("cent", '\u{a2}'), + ("chi", '\u{3c7}'), + ("circ", 
'\u{2c6}'), + ("clubs", '\u{2663}'), + ("cong", '\u{2245}'), + ("copy", '\u{a9}'), + ("crarr", '\u{21b5}'), + ("cup", '\u{222a}'), + ("curren", '\u{a4}'), + ("dArr", '\u{21d3}'), + ("dagger", '\u{2020}'), + ("darr", '\u{2193}'), + ("deg", '\u{b0}'), + ("delta", '\u{3b4}'), + ("diams", '\u{2666}'), + ("divide", '\u{f7}'), + ("eacute", '\u{e9}'), + ("ecirc", '\u{ea}'), + ("egrave", '\u{e8}'), + ("empty", '\u{2205}'), + ("emsp", '\u{2003}'), + ("ensp", '\u{2002}'), + ("epsilon", '\u{3b5}'), + ("equiv", '\u{2261}'), + ("eta", '\u{3b7}'), + ("eth", '\u{f0}'), + ("euml", '\u{eb}'), + ("euro", '\u{20ac}'), + ("exist", '\u{2203}'), + ("fnof", '\u{192}'), + ("forall", '\u{2200}'), + ("frac12", '\u{bd}'), + ("frac14", '\u{bc}'), + ("frac34", '\u{be}'), + ("frasl", '\u{2044}'), + ("gamma", '\u{3b3}'), + ("ge", '\u{2265}'), + ("gt", '\u{3e}'), + ("hArr", '\u{21d4}'), + ("harr", '\u{2194}'), + ("hearts", '\u{2665}'), + ("hellip", '\u{2026}'), + ("iacute", '\u{ed}'), + ("icirc", '\u{ee}'), + ("iexcl", '\u{a1}'), + ("igrave", '\u{ec}'), + ("image", '\u{2111}'), + ("infin", '\u{221e}'), + ("int", '\u{222b}'), + ("iota", '\u{3b9}'), + ("iquest", '\u{bf}'), + ("isin", '\u{2208}'), + ("iuml", '\u{ef}'), + ("kappa", '\u{3ba}'), + ("lArr", '\u{21d0}'), + ("lambda", '\u{3bb}'), + ("lang", '\u{2329}'), + ("laquo", '\u{ab}'), + ("larr", '\u{2190}'), + ("lceil", '\u{2308}'), + ("ldquo", '\u{201c}'), + ("le", '\u{2264}'), + ("lfloor", '\u{230a}'), + ("lowast", '\u{2217}'), + ("loz", '\u{25ca}'), + ("lrm", '\u{200e}'), + ("lsaquo", '\u{2039}'), + ("lsquo", '\u{2018}'), + ("lt", '\u{3c}'), + ("macr", '\u{af}'), + ("mdash", '\u{2014}'), + ("micro", '\u{b5}'), + ("middot", '\u{b7}'), + ("minus", '\u{2212}'), + ("mu", '\u{3bc}'), + ("nabla", '\u{2207}'), + ("nbsp", '\u{a0}'), + ("ndash", '\u{2013}'), + ("ne", '\u{2260}'), + ("ni", '\u{220b}'), + ("not", '\u{ac}'), + ("notin", '\u{2209}'), + ("nsub", '\u{2284}'), + ("ntilde", '\u{f1}'), + ("nu", '\u{3bd}'), + ("oacute", '\u{f3}'), + ("ocirc", 
'\u{f4}'), + ("oelig", '\u{153}'), + ("ograve", '\u{f2}'), + ("oline", '\u{203e}'), + ("omega", '\u{3c9}'), + ("omicron", '\u{3bf}'), + ("oplus", '\u{2295}'), + ("or", '\u{2228}'), + ("ordf", '\u{aa}'), + ("ordm", '\u{ba}'), + ("oslash", '\u{f8}'), + ("otilde", '\u{f5}'), + ("otimes", '\u{2297}'), + ("ouml", '\u{f6}'), + ("para", '\u{b6}'), + ("part", '\u{2202}'), + ("permil", '\u{2030}'), + ("perp", '\u{22a5}'), + ("phi", '\u{3c6}'), + ("pi", '\u{3c0}'), + ("piv", '\u{3d6}'), + ("plusmn", '\u{b1}'), + ("pound", '\u{a3}'), + ("prime", '\u{2032}'), + ("prod", '\u{220f}'), + ("prop", '\u{221d}'), + ("psi", '\u{3c8}'), + ("quot", '\u{22}'), + ("rArr", '\u{21d2}'), + ("radic", '\u{221a}'), + ("rang", '\u{232a}'), + ("raquo", '\u{bb}'), + ("rarr", '\u{2192}'), + ("rceil", '\u{2309}'), + ("rdquo", '\u{201d}'), + ("real", '\u{211c}'), + ("reg", '\u{ae}'), + ("rfloor", '\u{230b}'), + ("rho", '\u{3c1}'), + ("rlm", '\u{200f}'), + ("rsaquo", '\u{203a}'), + ("rsquo", '\u{2019}'), + ("sbquo", '\u{201a}'), + ("scaron", '\u{161}'), + ("sdot", '\u{22c5}'), + ("sect", '\u{a7}'), + ("shy", '\u{ad}'), + ("sigma", '\u{3c3}'), + ("sigmaf", '\u{3c2}'), + ("sim", '\u{223c}'), + ("spades", '\u{2660}'), + ("sub", '\u{2282}'), + ("sube", '\u{2286}'), + ("sum", '\u{2211}'), + ("sup", '\u{2283}'), + ("sup1", '\u{b9}'), + ("sup2", '\u{b2}'), + ("sup3", '\u{b3}'), + ("supe", '\u{2287}'), + ("szlig", '\u{df}'), + ("tau", '\u{3c4}'), + ("there4", '\u{2234}'), + ("theta", '\u{3b8}'), + ("thetasym", '\u{3d1}'), + ("thinsp", '\u{2009}'), + ("thorn", '\u{fe}'), + ("tilde", '\u{2dc}'), + ("times", '\u{d7}'), + ("trade", '\u{2122}'), + ("uArr", '\u{21d1}'), + ("uacute", '\u{fa}'), + ("uarr", '\u{2191}'), + ("ucirc", '\u{fb}'), + ("ugrave", '\u{f9}'), + ("uml", '\u{a8}'), + ("upsih", '\u{3d2}'), + ("upsilon", '\u{3c5}'), + ("uuml", '\u{fc}'), + ("weierp", '\u{2118}'), + ("xi", '\u{3be}'), + ("yacute", '\u{fd}'), + ("yen", '\u{a5}'), + ("yuml", '\u{ff}'), + ("zeta", '\u{3b6}'), + ("zwj", '\u{200d}'), + 
("zwnj", '\u{200c}'), +]; diff --git a/parse_wiki_text/src/lib.rs b/parse_wiki_text/src/lib.rs new file mode 100644 index 0000000..0eac2ea --- /dev/null +++ b/parse_wiki_text/src/lib.rs @@ -0,0 +1,604 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +//! Parse wiki text from Mediawiki into a tree of elements. +//! +//! # Introduction +//! +//! Wiki text is a format that follows the PHP maxim “Make everything as inconsistent and confusing as possible”. There are hundreds of millions of interesting documents written in this format, distributed under free licenses on sites that use the Mediawiki software, mainly Wikipedia and Wiktionary. Being able to parse wiki text and process these documents would allow access to a significant part of the world's knowledge. +//! +//! The Mediawiki software itself transforms a wiki text document into an HTML document in an outdated format to be displayed in a browser for a human reader. It does so through a [step by step procedure](https://www.mediawiki.org/wiki/Manual:Parser.php) of string substitutions, with some of the steps depending on the result of previous steps. [The main file for this procedure](https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html) has 6200 lines of code and the [second biggest file](https://doc.wikimedia.org/mediawiki-core/master/php/Preprocessor__DOM_8php_source.html) has 2000, and then there is a [1400 line file](https://doc.wikimedia.org/mediawiki-core/master/php/ParserOptions_8php_source.html) just to take options for the parser. +//! +//! What would be more interesting is to parse the wiki text document into a structure that can be used by a computer program to reason about the facts in the document and present them in different ways, making them available for a great variety of applications. +//! +//! 
Some people have tried to parse wiki text using regular expressions. This is incredibly naive and fails as soon as the wiki text is non-trivial. The capabilities of regular expressions don't come anywhere close to the complexity of the weirdness required to correctly parse wiki text. One project did a brave attempt to use a parser generator to parse wiki text. Wiki text was however never designed for formal parsers, so even parser generators are of no help in correctly parsing wiki text. +//! +//! Wiki text has a long history of poorly designed additions carelessly piled on top of each other. The syntax of wiki text is different in each wiki depending on its configuration. You can't even know what's a start tag until you see the corresponding end tag, and you can't know where the end tag is unless you parse the entire hierarchy of nested tags between the start tag and the end tag. In short: If you think you understand wiki text, you don't understand wiki text. +//! +//! Parse Wiki Text attempts to take all uncertainty out of parsing wiki text by converting it to another format that is easy to work with. The target format is Rust objects that can ergonomically be processed using iterators and match expressions. +//! +//! # Design goals +//! +//! ## Correctness +//! +//! Parse Wiki Text is designed to parse wiki text exactly as parsed by Mediawiki. Even when there is obviously a bug in Mediawiki, Parse Wiki Text replicates that exact bug. If there is something Parse Wiki Text doesn't parse exactly the same as Mediawiki, please report it as an issue. +//! +//! ## Speed +//! +//! Parse Wiki Text is designed to parse a page in as little time as possible. It parses tens of thousands of pages per second on each processor core and can quickly parse an entire wiki with millions of pages. If there is anything that can be changed to make Parse Wiki Text faster, please report it as an issue. +//! +//! ## Safety +//! +//! 
Parse Wiki Text is designed to work with untrusted inputs. If any input doesn't parse safely with reasonable resources, please report it as an issue. No unsafe code is used. +//! +//! ## Platform support +//! +//! Parse Wiki Text is designed to run in a wide variety of environments, such as: +//! +//! - servers running machine code +//! - browsers running Web Assembly +//! - embedded in other programming languages +//! +//! Parse Wiki Text can be deployed anywhere with no dependencies. +//! +//! # Caution +//! +//! Wiki text is a legacy format used by legacy software. Parse Wiki Text is intended only to recover information that has been written for wikis running legacy software, replicating the exact bugs found in the legacy software. Please don't use wiki text as a format for new applications. Wiki text is a horrible format with an astonishing amount of inconsistencies, bad design choices and bugs. For new applications, please use a format that is designed to be easy to process, such as JSON or even better [CBOR](http://cbor.io). See [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) for an example of a wiki that uses JSON as its format and provides a rich interface for editing data instead of letting people write code. If you need to take information written in wiki text and reuse it in a new application, you can use Parse Wiki Text to convert it to an intermediate format that you can further process into a modern format. +//! +//! # Site configuration +//! +//! Wiki text has plenty of features that are parsed in a way that depends on the configuration of the wiki. This means the configuration must be known before parsing. +//! +//! - External links are parsed only when the scheme of the URI of the link is in the configured list of valid protocols. When the scheme is not valid, the link is parsed as plain text. +//! - Categories and images superficially look the same way as links, but are parsed differently. 
These can only be distinguished by knowing the namespace aliases from the configuration of the wiki. +//! - Text matching the configured set of magic words is parsed as magic words. +//! - Extension tags have the same syntax as HTML tags, but are parsed differently. The configuration tells which tag names are to be treated as extension tags. +//! +//! The configuration can be seen by making a request to the [site info](https://www.mediawiki.org/wiki/API:Siteinfo) resource on the wiki. The utility [Fetch site configuration](https://github.com/portstrom/fetch_site_configuration) fetches the parts of the configuration needed for parsing pages in the wiki, and outputs Rust code for instantiating a parser with that configuration. Parse Wiki Text contains a default configuration that can be used for testing. +//! +//! # Limitations +//! +//! Wiki text was never designed to be possible to parse into a structured format. It's designed to be parsed in multiple passes, where each pass depends on the output of the previous pass. Most importantly, templates are expanded in an earlier pass and formatting codes are parsed in a later pass. This means the formatting codes you see in the original text are not necessarily the same as the parser will see after templates have been expanded. Luckily this is as bad for human editors as it is for computers, so people tend to avoid writing templates that cause formatting codes to be parsed in a way that differs from what they would expect from reading the original wiki text before expanding templates. Parse Wiki Text assumes that templates never change the meaning of formatting codes around them. +//! +//! # Sandbox +//! +//! A sandbox ([Github](https://github.com/portstrom/parse_wiki_text_sandbox), [try online](https://portstrom.com/parse_wiki_text_sandbox/)) is available that allows interactively entering wiki text and inspecting the result of parsing it. +//! +//! # Comparison with Mediawiki Parser +//! +//! 
There is another crate called Mediawiki Parser ([crates.io](https://crates.io/crates/mediawiki_parser), [Github](https://github.com/vroland/mediawiki-parser)) that does basically the same thing, parsing wiki text to a tree of elements. That crate however doesn't take into account any of the astonishing amount of weirdness required to correctly parse wiki text. That crate admittedly only parses a subset of wiki text, with the intention to report errors for any text that is too weird to fit that subset, which is a good intention, but when examining it, that subset is quickly found to be too small to parse pages from actual wikis, and even worse, the error reporting is just an empty promise, and there's no indication when a text is incorrectly parsed. +//! +//! That crate could possibly be improved to always report errors when a text isn't in the supported subset, but pages found in real wikis very often don't conform to the small subset of wiki text that can be parsed without weirdness, so it still wouldn't be useful. Improving that crate to correctly parse a large enough subset of wiki text would be as much effort as starting over from scratch, which is why Parse Wiki Text was made without taking anything from Mediawiki Parser. Parse Wiki Text aims to correctly parse all wiki text, not just a subset, and report warnings when encountering weirdness that should be avoided. +//! +//! # Examples +//! +//! The default configuration is used for testing purposes only. +//! For parsing a real wiki you need a site-specific configuration. +//! Reuse the same configuration when parsing multiple pages for efficiency. +//! +//! ``` +//! use parse_wiki_text::{Configuration, Node}; +//! let wiki_text = concat!( +//! "==Our values==\n", +//! "*Correctness\n", +//! "*Speed\n", +//! "*Ergonomics" +//! ); +//! let result = Configuration::default().parse(wiki_text); +//! assert!(result.warnings.is_empty()); +//! # let mut found = false; +//! for node in result.nodes { +//! 
if let Node::UnorderedList { items, .. } = node { +//! println!("Our values are:"); +//! for item in items { +//! println!("- {}", item.nodes.iter().map(|node| match node { +//! Node::Text { value, .. } => value, +//! _ => "" +//! }).collect::()); +//! # found = true; +//! } +//! } +//! } +//! # assert!(found); +//! ``` + +#![forbid(unsafe_code)] +#![warn(missing_docs)] + +mod bold_italic; +mod case_folding_simple; +mod character_entity; +mod comment; +mod configuration; +mod default; +mod external_link; +mod heading; +mod html_entities; +mod line; +mod link; +mod list; +mod magic_word; +mod parse; +mod positioned; +mod redirect; +mod state; +mod table; +mod tag; +mod template; +mod trie; +mod warning; + +pub use configuration::ConfigurationSource; +use configuration::Namespace; +use state::{OpenNode, OpenNodeType, State}; +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, +}; +use trie::Trie; +pub use warning::{Warning, WarningMessage}; + +/// Configuration for the parser. +/// +/// A configuration to correctly parse a real wiki can be created with `Configuration::new`. A configuration for testing and quick and dirty prototyping can be created with `Default::default`. +pub struct Configuration { + character_entities: Trie, + link_trail_character_set: HashSet, + magic_words: Trie<()>, + namespaces: Trie, + protocols: Trie<()>, + redirect_magic_words: Trie<()>, + tag_name_map: HashMap, +} + +/// List item of a definition list. +#[derive(Debug)] +pub struct DefinitionListItem<'a> { + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The content of the element. + pub nodes: Vec>, + + /// The byte position in the wiki text where the element starts. + pub start: usize, + + /// The type of list item. + pub type_: DefinitionListItemType, +} + +/// Identifier for the type of a definition list item. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum DefinitionListItemType { + /// Parsed from the code `:`. 
+ Details, + + /// Parsed from the code `;`. + Term, +} + +/// List item of an ordered list or unordered list. +#[derive(Debug)] +pub struct ListItem<'a> { + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The content of the element. + pub nodes: Vec>, + + /// The byte position in the wiki text where the element starts. + pub start: usize, +} + +/// Parsed node. +#[derive(Debug)] +pub enum Node<'a> { + /// Toggle bold text. Parsed from the code `'''`. + Bold { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Toggle bold and italic text. Parsed from the code `'''''`. + BoldItalic { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Category. Parsed from code starting with `[[`, a category namespace and `:`. + Category { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// Additional information for sorting entries on the category page, if any. + ordinal: Vec>, + + /// The byte position in the wiki text where the element starts. + start: usize, + + /// The category referred to. + target: &'a str, + }, + + /// Character entity. Parsed from code starting with `&` and ending with `;`. + CharacterEntity { + /// The character represented. + character: char, + + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Comment. Parsed from code starting with `