diff options
author | JJ | 2023-01-04 23:57:41 +0000 |
---|---|---|
committer | JJ | 2023-01-04 23:57:48 +0000 |
commit | a2e04ff18ad27be4dc1c66079941baaec79e003f (patch) | |
tree | 256201497d3c3ef3dba9031ee985d407b80b95a6 | |
parent | baf2f93b3002c2a0769bbd53f37d845c7717d95b (diff) |
Copy the last version of the parse_wiki_text crate in for development
30 files changed, 7738 insertions, 2 deletions
@@ -7,5 +7,8 @@ edition = "2021" [dependencies] bzip2 = "0.4.3" -parse_wiki_text = "0.1.5" -# peg = "0.8.1" + +[dependencies.parse_wiki_text] +version = "0.1.5" +path = "parse_wiki_text" + diff --git a/parse_wiki_text/Cargo.toml b/parse_wiki_text/Cargo.toml new file mode 100644 index 0000000..d40739b --- /dev/null +++ b/parse_wiki_text/Cargo.toml @@ -0,0 +1,9 @@ +[package] +authors = ["Fredrik Portström <https://portstrom.com>"] +description = "Parse wiki text from Mediawiki into a tree of elements" +edition = "2018" +license-file = "LICENSE" +name = "parse_wiki_text" +readme = "readme.md" +repository = "https://github.com/portstrom/parse_wiki_text" +version = "0.1.5" diff --git a/parse_wiki_text/LICENSE b/parse_wiki_text/LICENSE new file mode 100644 index 0000000..e445eee --- /dev/null +++ b/parse_wiki_text/LICENSE @@ -0,0 +1,5 @@ +Copyright 2019 Fredrik Portström <https://portstrom.com> + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/parse_wiki_text/examples/test/main.rs b/parse_wiki_text/examples/test/main.rs new file mode 100644 index 0000000..72872fc --- /dev/null +++ b/parse_wiki_text/examples/test/main.rs @@ -0,0 +1,51 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +extern crate parse_wiki_text; + +mod test; +mod test_cases; + +fn main() { + let mut args = std::env::args(); + match args.nth(1) { + None => return test::run_test(&Default::default()), + Some(command) => match &command as _ { + "file" => { + if let Some(path) = args.next() { + if args.next().is_none() { + match std::fs::read_to_string(path) { + Err(error) => { + eprintln!("Failed to read file: {}", error); + std::process::exit(1); + } + Ok(file_contents) => { + println!( + "{:#?}", + parse_wiki_text::Configuration::default().parse(&file_contents) + ); + return; + } + } + } + } + } + "text" => { + if let Some(wiki_text) = args.next() { + if args.next().is_none() { + println!( + "{:#?}", + parse_wiki_text::Configuration::default() + .parse(&wiki_text.replace("\\t", "\t").replace("\\n", "\n")) + ); + return; + } + } + } + _ => {} + }, + } + eprintln!("invalid use"); + std::process::exit(1); +} diff --git a/parse_wiki_text/examples/test/test.rs b/parse_wiki_text/examples/test/test.rs new file mode 100644 index 0000000..9f04942 --- /dev/null +++ b/parse_wiki_text/examples/test/test.rs @@ -0,0 +1,70 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +use crate::test_cases::TEST_CASES; + +pub fn run_test(configuration: &parse_wiki_text::Configuration) { + let mut output = concat!( + "<title>Parse Wiki Text test cases</title>", + "<style>", + "a{color:#006064;display:block;padding:8;text-decoration:none}", + "a:hover{background:#eee}", + "body{background:#f7f7f7;display:flex;font-family:sans-serif;height:100%;margin:0}", + "div div{background:#fff;box-shadow: 0 1px 3px rgba(0,0,0,.12),0 1px 2px rgba(0,0,0,.24);margin:16;padding:16}", + "h1{font-size:20;margin:24 16 16}", + "hr{border:0;border-top:1px solid #ccc}", + "pre{margin:0}", + "span{color:#aaa}", + "</style>", + "<div style=\"background:#fff;box-shadow: 0 1px 3px rgba(0,0,0,.12),0 1px 2px rgba(0,0,0,.24);flex:0 1 220px;overflow:auto\">" + ).to_owned(); + if let Some(window) = TEST_CASES + .windows(2) + .find(|window| window[0].0 >= window[1].0) + { + panic!("Sort: {:#?}", (window[0].0, window[1].0)); + } + for (title, test_cases) in TEST_CASES { + if let Some(window) = test_cases.windows(2).find(|window| window[0] >= window[1]) { + panic!("Sort: {:#?}", window); + } + output += &format!("<a href=#{}>", title.replace(" ", "_")); + output += title; + output += &format!(" <span>{}</span></a>", test_cases.len()); + } + output += "</div><div style=\"flex:1 1 200px;overflow:auto\">"; + for (title, test_cases) in TEST_CASES { + output += &format!("<h1 id={}>", title.replace(" ", "_")); + output += title; + output += "</h1>"; + for wiki_text in *test_cases { + output += "<div><pre>"; + output += &wiki_text + .replace("&", "&") + .replace("<", "<") + .replace("\t", "<span>⭾</span>") + .replace("\n", "<span>⏎</span>\n") + .replace(" ", "<span>·</span>") + .replace("</span><span>", ""); + match std::panic::catch_unwind(|| configuration.parse(wiki_text)) { + Err(_) => { + eprintln!("Panic with wiki text {:?}", wiki_text); + output += "</pre><hr>panic</div>"; + } + Ok(result) => { + output += "</pre><hr><pre>"; + output += &format!("{:#?}", result) + .replace("&", 
"&") + .replace("<", "<"); + output += "</pre></div>"; + } + } + } + } + output += "</div>"; + if let Err(error) = std::fs::write("report.html", output) { + eprintln!("Failed to write report: {}", error); + std::process::exit(1); + } +} diff --git a/parse_wiki_text/examples/test/test_cases.rs b/parse_wiki_text/examples/test/test_cases.rs new file mode 100644 index 0000000..da7b374 --- /dev/null +++ b/parse_wiki_text/examples/test/test_cases.rs @@ -0,0 +1,726 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub const TEST_CASES: &[(&str, &[&str])] = &[ + ( + "basic", + &[ + "", + "\t", + "\t\n", + "\t alpha", + "\talpha", + "\n", + "\n\t", + "\n\n\nalpha", + "\n\nalpha", + "\n\nalpha\n\n", + "\n \nalpha", + "\nalpha", + "\nalpha\n", + " ", + " \n\nalpha", + " \nalpha", + " ", + "!!", + "alpha", + "alpha\t", + "alpha\n", + "alpha\n\t", + "alpha\n\n", + "alpha\n\n\n", + "alpha\n\n ", + "alpha\n ", + "alpha\n \n", + "alpha\nbeta", + "alpha ", + "alpha \n", + ], + ), + ( + "bold italic", + &[ + "'", + "''", + "'''", + "''''", + "'''''", + "''''''", + "'''''''", + "''''''''", + "'''alpha", + "'''alpha''", + "'''alpha'''", + "''alpha", + "''alpha''", + "''alpha'''", + "alpha''", + "alpha'''", + "alpha'''beta", + "alpha'''beta'''gamma", + "alpha'''beta''gamma", + "alpha''beta", + "alpha''beta'''gamma", + "alpha''beta''gamma", + ], + ), + ( + "character entity", + &[ + "≪", + "Ö", + "<", + "<ö", + "<", + "< alpha", + "<ö", + "<alpha", + "ö", + "alpha <", + "alpha < beta", + "alpha<", + "alpha<beta", + ], + ), + ( + "comment", + &[ + "<!--", + "<!---->", + "<!---->beta", + "<!--->beta", + "<!--<!--alpha-->-->beta", + "<!--alpha-->", + "<!--alpha--> beta", + "<!--alpha--><!--beta", + "<!--alpha--><!--beta-->", + "<!--alpha-->beta", + "<!-<!--alpha-->beta", + "alpha <!--beta", + "alpha<!--beta", + ], + ), + ( + "external 
link", + &[ + "[//alpha", + "[//alpha beta\ngamma]", + "[//alpha beta]", + "[//alpha]", + "[//alpha] beta", + "[//alpha]beta", + "[HTTP://alpha]", + "[Http://alpha]", + "[alpha://beta]", + "[hTtP://alpha]", + "[http://alpha]", + "[http:/alpha]", + "[http:alpha]", + "[https://alpha]", + "[sip:alpha]", + "alpha [//beta]", + "alpha [//beta] gamma", + "alpha[//beta]", + "alpha[//beta]gamma", + ], + ), + ( + "heading", + &[ + "=", + "= =", + "= alpha =", + "=''=", + "==", + "== ''=", + "== alpha''=", + "==''=", + "===", + "====", + "=====", + "======", + "========alpha========", + "=======alpha======", + "=======alpha=======", + "======alpha======", + "=====alpha=====", + "====alpha====", + "===alpha===", + "==alpha''=", + "==alpha=", + "==alpha==", + "=alpha", + "=alpha\nbeta=", + "=alpha=", + "=alpha=\n\n\nbeta", + "=alpha=\n\n=beta=", + "=alpha=\n\nbeta", + "=alpha=\n=beta=", + "=alpha=\nbeta", + "=alpha= \nbeta", + "=alpha==", + "alpha\t\n=beta=", + "alpha\n\n=beta=", + "alpha\n\n=beta=\n\ngamma", + "alpha\n=beta=", + "alpha\n=beta=\ngamma", + "alpha \n=beta=", + ], + ), + ( + "horizontal divider", + &[ + "----", + "----\t\nalpha", + "----\n\n\n----", + "----\n\n\nalpha", + "----\n\n----", + "----\n\nalpha", + "----\n----", + "----\nalpha", + "---- \nalpha", + "-----", + "------", + "----alpha", + "alpha\t\n----", + "alpha\n\n\n----", + "alpha\n\n----", + "alpha\n \n----", + "alpha\n----", + "alpha \n----", + ], + ), + ("invalid character", &["\0", "\r", "\x7f"]), + ( + "link", + &[ + "[[FILE:alpha]]", + "[[File:alpha]]", + "[[alpha", + "[[alpha:beta]]", + "[[alpha:beta]]gamma", + "[[alpha]]", + "[[alpha]] beta", + "[[alpha]]beta", + "[[alpha]]beta gamma", + "[[alpha]]ü", + "[[alpha|", + "[[alpha|[beta]gamma]]", + "[[alpha|]]", + "[[alpha|beta", + "[[alpha|beta\ngamma]]", + "[[alpha|beta[[gamma]]]]", + "[[alpha|beta]]", + "[[alpha|beta]]gamma", + "[[category:alpha]]", + "[[category:alpha]]beta", + "[[category:alpha|beta]]", + "[[file:alpha]]", + 
"[[file:alpha]]beta", + "[[file:alpha|[[beta]]]]", + "[[file:alpha|[[beta]]gamma]]", + "[[file:alpha|]]", + "[[file:alpha|beta[[gamma]]]]", + "[[file:alpha|beta]]", + "[[file:alpha|beta]]gamma", + "[[image:alpha]]", + "[[|]]", + "[[|alpha]]", + "alpha [[beta]]", + "alpha[[beta]]", + "alpha[[beta]]gamma", + ], + ), + ( + "list", + &[ + "#", + "#\n\n\nalpha", + "#\n\nalpha", + "#\n#", + "#\n##", + "#\n##\n#", + "#\n*", + "#\n:", + "#\n;", + "#\nalpha", + "# alpha", + "##", + "##\n#", + "##\n#\n##", + "##\n##", + "#=alpha=", + "#alpha", + "#alpha\n#beta", + "*", + "*\n\nalpha", + "*\n#", + "*\n*", + "*\n**", + "*\n**\n*", + "*\n:", + "*\n;", + "*\nalpha", + "* alpha", + "* alpha\n* beta", + "**", + "**\n*", + "**\n*\n**", + "**\n**", + "*;\n*;", + "*;\n*;*", + "*;*\n*;", + "*;*\n*;#", + "*=alpha=", + "*alpha", + "*alpha\n*beta", + ":", + ":\n\nalpha", + ":\n#", + ":\n*", + ":\n:", + ":\n::", + ":\n::\n:", + ":\n;", + ":\nalpha", + ": alpha", + "::", + "::\n:", + "::\n:\n::", + "::\n::", + ":=alpha=", + ":alpha", + ":alpha\nbeta", + ";", + ";\n\nalpha", + ";\n#", + ";\n*", + ";\n:", + ";\n;;", + ";\n;;\n;", + ";\nalpha", + "; alpha", + ";;", + ";;\n;", + ";;\n;\n;;", + ";;\n;;", + ";=alpha=", + ";alpha", + ";alpha\nbeta", + "alpha\t\n#", + "alpha\n#", + "alpha\n#\nbeta", + "alpha\n*", + "alpha\n*\nbeta", + "alpha\n:", + "alpha\n:\nbeta", + "alpha\n;", + "alpha\n;\nbeta", + "alpha \n#", + ], + ), + ( + "magic word", + &[ + "__ALPHA__", + "__NOTC__ __TOC__", + "__NOTC___TOC__", + "__NOTC____TOC__", + "__TOC_", + "__TOC__", + "__TOC__ alpha", + "__TOC__alpha", + "__ToC__", + "__tOc__", + "__toc__", + "alpha __TOC__", + "alpha __TOC__ beta", + "alpha__TOC__", + "alpha__TOC__beta", + ], + ), + ( + "mix", + &[ + " alpha\n {|\n beta\n |}\n gamma", + " alpha\n {|\n|}", + " alpha\n |}", + " alpha\n |}\n beta", + " {|\n alpha\n |}", + " {|\n alpha\n|}", + "*\n alpha\n*", + "----\t\n*", + "----\n\n*", + "----\n*", + "----\n*\nalpha", + "---- \n*", + "<ref><!--", + 
"=alpha=\n\n----", + "=alpha=\n----", + "{{alpha|<!--", + "{|\n alpha\n |}", + "{|\n alpha\n|}", + "{|\n|}\t\n*", + "{|\n|}\n*", + "{|\n|}\n*\nalpha", + "{|\n|} \n*", + ], + ), + ( + "nowiki", + &[ + "<MATH>''</MATH>", + "<NOWIKI>''</NOWIKI>", + "<mAtH>''</MaTh>", + "<math>''</math>", + "<math>''alpha", + "<nOwIkI>''</NoWiKi>", + "<nowiki>\n*alpha\n</nowiki>", + "<nowiki>\n=alpha=\n</nowiki>", + "<nowiki>''</nowiki>", + "<nowiki>''alpha", + "<nowiki><!-- alpha --></nowiki>", + "<nowiki>{{</nowiki>", + "<nowiki>{{alpha}}</nowiki>", + "<nowiki>}}</nowiki>", + ], + ), + ( + "paragraph break", + &[ + "alpha\t\n\nbeta", + "alpha\n\t\nbeta", + "alpha\n\n\t beta", + "alpha\n\n\tbeta", + "alpha\n\n\n\nbeta", + "alpha\n\n\nbeta", + "alpha\n\nbeta", + "alpha\n \nbeta", + "alpha \n\nbeta", + ], + ), + ( + "parameter", + &[ + "*alpha}}}", + "[[alpha|beta}}}]]", + "{{{", + "{{{\talpha}}}", + "{{{\nalpha}}}", + "{{{''}}}", + "{{{[[alpha|beta}}}", + "{{{alpha\t|beta}}}", + "{{{alpha\t}}}", + "{{{alpha\n|beta}}}", + "{{{alpha\n}}}", + "{{{alpha |beta}}}", + "{{{alpha }}}", + "{{{alpha|", + "{{{alpha|\tbeta}}}", + "{{{alpha|\t|}}}", + "{{{alpha|\t}}}", + "{{{alpha|\nbeta}}}", + "{{{alpha|\n|}}}", + "{{{alpha|\n}}}", + "{{{alpha| beta|}}}", + "{{{alpha| |}}}", + "{{{alpha| }}}", + "{{{alpha|beta\t|}}}", + "{{{alpha|beta\n|}}}", + "{{{alpha|beta |}}}", + "{{{alpha|beta|", + "{{{alpha|beta|\n}}}", + "{{{alpha|beta|gamma}}}", + "{{{alpha|beta|}}}", + "{{{alpha|beta}}}", + "{{{alpha|}}}", + "{{{alpha}}}", + "{{{|''}}}", + "{{{||}}}", + "{{{|}}}", + "{{{}}}", + "}}}", + ], + ), + ( + "preformatted block", + &[ + " alpha", + " alpha", + " alpha\n\n\nbeta", + " alpha\n\nbeta", + " alpha\n beta", + " alpha\n beta\n gamma", + " alpha\n beta\ngamma", + " alpha\nbeta", + " alpha\nbeta\n gamma", + "alpha\t\n beta", + "alpha\n\n beta", + "alpha\n \n beta", + "alpha\n =beta=\ngamma", + "alpha\n beta", + "alpha\n beta\n gamma", + "alpha\n beta\ngamma", + "alpha \n beta", + ], + ), + ( + 
"redirect", + &[ + "\t#REDIRECT[[alpha]]", + "\n\n#REDIRECT[[alpha]]", + "\n #REDIRECT[[alpha]]", + "\n#REDIRECT [[alpha]]", + " \n#REDIRECT[[alpha]]", + " #REDIRECT[[alpha]]", + " #REDIRECT[[alpha]]", + "#REDIRECT\t:[[alpha]]", + "#REDIRECT\t[[alpha]]", + "#REDIRECT\n\n[[alpha]]", + "#REDIRECT\n [[alpha]]", + "#REDIRECT\n:\n[[alpha]]", + "#REDIRECT\n:[[alpha]]", + "#REDIRECT\n[[alpha]]", + "#REDIRECT \n[[alpha]]", + "#REDIRECT [[alpha]]", + "#REDIRECT : [[alpha]]", + "#REDIRECT :[[alpha]]", + "#REDIRECT [[alpha]]", + "#REDIRECT:\t[[alpha]]", + "#REDIRECT:\n[[alpha]]", + "#REDIRECT: [[alpha]]", + "#REDIRECT:[[alpha]]", + "#REDIRECT[[alpha]]", + "#REDIRECT[[alpha]]\n\nbeta", + "#REDIRECT[[alpha]]\n beta", + "#REDIRECT[[alpha]]\nbeta", + "#REDIRECT[[alpha]] \nbeta", + "#REDIRECT[[alpha]] beta", + "#REDIRECT[[alpha]] beta", + "#REDIRECT[[alpha]]''beta", + "#REDIRECT[[alpha]]beta", + "#REDIRECT[[alpha|]]", + "#REDIRECT[[alpha|]]beta", + "#REDIRECT[[alpha|beta\ngamma]]", + "#REDIRECT[[alpha|beta]]", + "#REDIRECT[[alpha|beta]]=gamma=", + "#REDIRECT[[alpha|beta]]gamma", + "#ReDiReCt[[alpha]]", + "#rEdIrEcT[[alpha]]", + "#redirect[[alpha]]", + ], + ), + ( + "table", + &[ + " {|\n |}", + " {|\n|}", + "alpha\n{|\nbeta\n|}", + "{|", + "{|\n |}", + "{|\n!\n alpha\n|}", + "{|\n!\n!\n|}", + "{|\n!\nalpha\n\nbeta\n|}", + "{|\n!\nalpha\n\n|}", + "{|\n!\nalpha\nbeta\n|}", + "{|\n!\nalpha \n|}", + "{|\n!\n|\n|}", + "{|\n!\n|-\n|}", + "{|\n!\n|}", + "{|\n! 
alpha\n|}", + "{|\n!!\n|}", + "{|\n!!!\n|}", + "{|\n!!!!\n|}", + "{|\n!!!|\n|}", + "{|\n!alpha\n\nbeta\n|}", + "{|\n!alpha\nbeta\n|}", + "{|\n!alpha\nbeta|gamma\n|}", + "{|\n!alpha\n|}", + "{|\n!alpha!!beta\n|}", + "{|\n!alpha!beta\n|}", + "{|\n!alpha|beta\n|}", + "{|\n!alpha||beta\n|}", + "{|\n!|\n|}", + "{|\n!|!!\n|}", + "{|\n!|alpha\n|}", + "{|\n!|alpha|beta\n|}", + "{|\n!||\n|}", + "{|\n!||alpha\n|}", + "{|\n!|||\n|}", + "{|\n*alpha\n|}", + "{|\n=alpha=\n|}", + "{|\nalpha\n|}", + "{|\n|", + "{|\n|\n alpha\n|}", + "{|\n|\n!\n|}", + "{|\n|\n*alpha\n|}", + "{|\n|\n=alpha=\n|}", + "{|\n|\nalpha\n\nbeta\n|}", + "{|\n|\nalpha\n\n|}", + "{|\n|\nalpha\nbeta\n|}", + "{|\n|\nalpha \n|}", + "{|\n|\n|\n|}", + "{|\n|\n|-\n|}", + "{|\n|\n|}", + "{|\n| alpha\n|}", + "{|\n|+\n alpha\n|}", + "{|\n|+\n*alpha\n|}", + "{|\n|+\n=alpha=\n|}", + "{|\n|+\nalpha\n\nbeta\n|}", + "{|\n|+\nalpha\nbeta\n|}", + "{|\n|+\nalpha\n|}", + "{|\n|+\n|+\n|}", + "{|\n|+\n|}", + "{|\n|+ alpha\n|}", + "{|\n|+!!\n|}", + "{|\n|+alpha\n\nbeta\n|}", + "{|\n|+alpha\nbeta\n|}", + "{|\n|+alpha\n|}", + "{|\n|+alpha \n|}", + "{|\n|+|\n|}", + "{|\n|+|alpha|\n|}", + "{|\n|+|alpha|beta\n|}", + "{|\n|+||\n|}", + "{|\n|+||alpha\n|}", + "{|\n|+|||\n|}", + "{|\n|-\n alpha\n|}", + "{|\n|-\n!\n|}", + "{|\n|-\n*alpha\n|}", + "{|\n|-\n=alpha=\n|}", + "{|\n|-\nalpha\n|}", + "{|\n|-\n|\n|}", + "{|\n|-\n|-\n|}", + "{|\n|-\n|}", + "{|\n|- alpha\n|}", + "{|\n|-alpha\n\n|}", + "{|\n|-alpha\n|}", + "{|\n|-alpha \n|}", + "{|\n|alpha\n\nbeta\n|}", + "{|\n|alpha\nbeta\n|}", + "{|\n|alpha\nbeta|gamma\n|}", + "{|\n|alpha\n|}", + "{|\n|alpha!!beta\n|}", + "{|\n|alpha!beta\n|}", + "{|\n|alpha|\n|}", + "{|\n|alpha|beta\n|}", + "{|\n|alpha||beta\n|}", + "{|\n||\n|}", + "{|\n||alpha\n|}", + "{|\n|||\n|}", + "{|\n||||\n|}", + "{|\n|}", + "{|\n|}\t\nalpha", + "{|\n|}\n\n\nalpha", + "{|\n|}\n\nalpha", + "{|\n|}\nalpha", + "{|\n|} \nalpha", + "{|\n|}alpha", + "{|alpha\nbeta\n|}", + "{|alpha\n|}", + ], + ), + ( + "tag", + &[ + "</BR>", + 
"</Br>", + "</alpha", + "</alpha>", + "</b", + "</b alpha>", + "</b alpha>beta", + "</b</b>", + "</b<b>", + "</b>", + "</b> alpha", + "</b>alpha", + "</br\t>", + "</br\n>", + "</br >", + "</br>", + "</ref", + "<BR>", + "<Br>", + "<alpha", + "<alpha>", + "<b", + "<b alpha>", + "<b alpha>beta", + "<b</b>", + "<b<b>", + "<b>", + "<b> alpha", + "<b>alpha", + "<br\t>", + "<br\n>", + "<br >", + "<br>", + "<r<ref>alpha</ref>beta", + "<ref", + "<ref />", + "<ref >", + "<ref/>", + "<ref>", + "<ref>\talpha</ref>", + "<ref>\nalpha</ref>", + "<ref> alpha</ref>", + "<ref></ref>", + "<ref>alpha\t</ref>", + "<ref>alpha\n</ref>", + "<ref>alpha </ref>", + "<ref>alpha</ref>", + "alpha<b>", + ], + ), + ( + "template", + &[ + "*alpha}}", + "[[alpha|beta}}]]", + "alpha {{beta}}", + "alpha {{beta}} gamma", + "alpha{{beta}}", + "alpha{{beta}}gamma", + "{{\nalpha}}", + "{{''}}", + "{{[[alpha|beta}}", + "{{alpha", + "{{alpha\n|beta}}", + "{{alpha\n|}}", + "{{alpha\n}}", + "{{alpha|", + "{{alpha|\nbeta}}", + "{{alpha|\n}}", + "{{alpha| beta}}", + "{{alpha|''}}", + "{{alpha|beta", + "{{alpha|beta\n=gamma}}", + "{{alpha|beta\n}}", + "{{alpha|beta =gamma}}", + "{{alpha|beta }}", + "{{alpha|beta=\ngamma}}", + "{{alpha|beta= gamma}}", + "{{alpha|beta=gamma\n}}", + "{{alpha|beta=gamma }}", + "{{alpha|beta=gamma=delta}}", + "{{alpha|beta=gamma|delta=epsilon}}", + "{{alpha|beta=gamma|delta}}", + "{{alpha|beta=gamma}}", + "{{alpha|beta=}}", + "{{alpha|beta|gamma=delta}}", + "{{alpha|beta|gamma}}", + "{{alpha|beta}", + "{{alpha|beta}}", + "{{alpha|beta}} gamma", + "{{alpha|beta}}gamma", + "{{alpha|}", + "{{alpha|}}", + "{{alpha}", + "{{alpha}}", + "{{alpha}} beta", + "{{alpha}}beta", + "}}", + ], + ), +]; diff --git a/parse_wiki_text/readme.md b/parse_wiki_text/readme.md new file mode 100644 index 0000000..b6de3bc --- /dev/null +++ b/parse_wiki_text/readme.md @@ -0,0 +1,107 @@ +<!-- +Copyright 2019 Fredrik Portström <https://portstrom.com> +This is free software distributed under the terms specified 
in +the file LICENSE at the top-level directory of this distribution. +--> + +# Parse Wiki Text + +Parse wiki text from Mediawiki into a tree of elements. + +![Parse Wiki Text](https://portstrom.com/parse_wiki_text.svg) + +## Introduction + +Wiki text is a format that follows the PHP maxim “Make everything as inconsistent and confusing as possible”. There are hundreds of millions of interesting documents written in this format, distributed under free licenses on sites that use the Mediawiki software, mainly Wikipedia and Wiktionary. Being able to parse wiki text and process these documents would allow access to a significant part of the world's knowledge. + +The Mediawiki software itself transforms a wiki text document into an HTML document in an outdated format to be displayed in a browser for a human reader. It does so through a [step by step procedure](https://www.mediawiki.org/wiki/Manual:Parser.php) of string substitutions, with some of the steps depending on the result of previous steps. [The main file for this procedure](https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html) has 6200 lines of code and the [second biggest file](https://doc.wikimedia.org/mediawiki-core/master/php/Preprocessor__DOM_8php_source.html) has 2000, and then there is a [1400 line file](https://doc.wikimedia.org/mediawiki-core/master/php/ParserOptions_8php_source.html) just to take options for the parser. + +What would be more interesting is to parse the wiki text document into a structure that can be used by a computer program to reason about the facts in the document and present them in different ways, making them available for a great variety of applications. + +Some people have tried to parse wiki text using regular expressions. This is incredibly naive and fails as soon as the wiki text is non-trivial. The capabilities of regular expressions don't come anywhere close to the complexity of the weirdness required to correctly parse wiki text. 
One project made a brave attempt to use a parser generator to parse wiki text. Wiki text was however never designed for formal parsers, so even parser generators are of no help in correctly parsing wiki text. + +Wiki text has a long history of poorly designed additions carelessly piled on top of each other. The syntax of wiki text is different in each wiki depending on its configuration. You can't even know what's a start tag until you see the corresponding end tag, and you can't know where the end tag is unless you parse the entire hierarchy of nested tags between the start tag and the end tag. In short: If you think you understand wiki text, you don't understand wiki text. + +Parse Wiki Text attempts to take all uncertainty out of parsing wiki text by converting it to another format that is easy to work with. The target format is Rust objects that can ergonomically be processed using iterators and match expressions. + +## Design goals + +### Correctness + +Parse Wiki Text is designed to parse wiki text exactly as parsed by Mediawiki. Even when there is obviously a bug in Mediawiki, Parse Wiki Text replicates that exact bug. If there is something Parse Wiki Text doesn't parse exactly the same as Mediawiki, please report it as an issue. + +### Speed + +Parse Wiki Text is designed to parse a page in as little time as possible. It parses tens of thousands of pages per second on each processor core and can quickly parse an entire wiki with millions of pages. If there is anything that can be changed to make Parse Wiki Text faster, please report it as an issue. + +### Safety + +Parse Wiki Text is designed to work with untrusted inputs. If any input doesn't parse safely with reasonable resources, please report it as an issue. No unsafe code is used. 
+ +### Platform support + +Parse Wiki Text is designed to run in a wide variety of environments, such as: + +- servers running machine code +- browsers running Web Assembly +- embedded in other programming languages + +Parse Wiki Text can be deployed anywhere with no dependencies. + +## Caution + +Wiki text is a legacy format used by legacy software. Parse Wiki Text is intended only to recover information that has been written for wikis running legacy software, replicating the exact bugs found in the legacy software. Please don't use wiki text as a format for new applications. Wiki text is a horrible format with an astonishing number of inconsistencies, bad design choices and bugs. For new applications, please use a format that is designed to be easy to process, such as JSON or even better [CBOR](http://cbor.io). See [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) for an example of a wiki that uses JSON as its format and provides a rich interface for editing data instead of letting people write code. If you need to take information written in wiki text and reuse it in a new application, you can use Parse Wiki Text to convert it to an intermediate format that you can further process into a modern format. + +## Site configuration + +Wiki text has plenty of features that are parsed in a way that depends on the configuration of the wiki. This means the configuration must be known before parsing. + +- External links are parsed only when the scheme of the URI of the link is in the configured list of valid protocols. When the scheme is not valid, the link is parsed as plain text. +- Categories and images superficially look the same way as links, but are parsed differently. These can only be distinguished by knowing the namespace aliases from the configuration of the wiki. +- Text matching the configured set of magic words is parsed as magic words. +- Extension tags have the same syntax as HTML tags, but are parsed differently. 
The configuration tells which tag names are to be treated as extension tags. + +The configuration can be seen by making a request to the [site info](https://www.mediawiki.org/wiki/API:Siteinfo) resource on the wiki. The utility [Fetch site configuration](https://github.com/portstrom/fetch_mediawiki_configuration) fetches the parts of the configuration needed for parsing pages in the wiki, and outputs Rust code for instantiating a parser with that configuration. Parse Wiki Text contains a default configuration that can be used for testing. + +## Limitations + +Wiki text was never designed to be possible to parse into a structured format. It's designed to be parsed in multiple passes, where each pass depends on the output of the previous pass. Most importantly, templates are expanded in an earlier pass and formatting codes are parsed in a later pass. This means the formatting codes you see in the original text are not necessarily the same as the parser will see after templates have been expanded. Luckily this is as bad for human editors as it is for computers, so people tend to avoid writing templates that cause formatting codes to be parsed in a way that differs from what they would expect from reading the original wiki text before expanding templates. Parse Wiki Text assumes that templates never change the meaning of formatting codes around them. + +## Sandbox + +A sandbox ([Github](https://github.com/portstrom/parse_wiki_text_sandbox), [try online](https://portstrom.com/parse_wiki_text_sandbox/)) is available that allows interactively entering wiki text and inspecting the result of parsing it. + +## Comparison with Mediawiki Parser + +There is another crate called Mediawiki Parser ([crates.io](https://crates.io/crates/mediawiki_parser), [Github](https://github.com/vroland/mediawiki-parser)) that does basically the same thing, parsing wiki text to a tree of elements. 
That crate however doesn't take into account any of the astonishing amount of weirdness required to correctly parse wiki text. That crate admittedly only parses a subset of wiki text, with the intention to report errors for any text that is too weird to fit that subset, which is a good intention, but when examining it, that subset is quickly found to be too small to parse pages from actual wikis, and even worse, the error reporting is just an empty promise, and there's no indication when a text is incorrectly parsed. + +That crate could possibly be improved to always report errors when a text isn't in the supported subset, but pages found in real wikis very often don't conform to the small subset of wiki text that can be parsed without weirdness, so it still wouldn't be useful. Improving that crate to correctly parse a large enough subset of wiki text would be as much effort as starting over from scratch, which is why Parse Wiki Text was made without taking anything from Mediawiki Parser. Parse Wiki Text aims to correctly parse all wiki text, not just a subset, and report warnings when encountering weirdness that should be avoided. + +## Examples + +The default configuration is used for testing purposes only. +For parsing a real wiki you need a site-specific configuration. +Reuse the same configuration when parsing multiple pages for efficiency. + +```rust +use parse_wiki_text::{Configuration, Node}; +let wiki_text = concat!( + "==Our values==\n", + "*Correctness\n", + "*Speed\n", + "*Ergonomics" +); +let result = Configuration::default().parse(wiki_text); +assert!(result.warnings.is_empty()); +for node in result.nodes { + if let Node::UnorderedList { items, .. } = node { + println!("Our values are:"); + for item in items { + println!("- {}", item.nodes.iter().map(|node| match node { + Node::Text { value, .. 
} => value, + _ => "" + }).collect::<String>()); + } + } +} +``` diff --git a/parse_wiki_text/src/bold_italic.rs b/parse_wiki_text/src/bold_italic.rs new file mode 100644 index 0000000..e5ac613 --- /dev/null +++ b/parse_wiki_text/src/bold_italic.rs @@ -0,0 +1,33 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_bold_italic(state: &mut crate::State) { + let scan_position = state.scan_position; + state.flush(scan_position); + let start_position = state.scan_position; + state.scan_position += 2; + while state.get_byte(state.scan_position) == Some(b'\'') { + state.scan_position += 1; + } + let length = state.scan_position - start_position; + if length < 3 { + state.flushed_position = state.scan_position; + state.nodes.push(crate::Node::Italic { + end: state.flushed_position, + start: start_position, + }); + } else if length < 5 { + state.flushed_position = start_position + 3; + state.nodes.push(crate::Node::Bold { + end: state.flushed_position, + start: start_position, + }); + } else { + state.flushed_position = start_position + 5; + state.nodes.push(crate::Node::BoldItalic { + end: state.flushed_position, + start: start_position, + }); + } +} diff --git a/parse_wiki_text/src/case_folding_simple.rs b/parse_wiki_text/src/case_folding_simple.rs new file mode 100644 index 0000000..3bd48c9 --- /dev/null +++ b/parse_wiki_text/src/case_folding_simple.rs @@ -0,0 +1,2632 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple /home/andrew/tmp/ucd-10.0.0/ --chars --all-pairs +// +// ucd-generate is available on crates.io. 
+ +pub const CASE_FOLDING_SIMPLE: &[(char, &[char])] = &[ + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k', 'K']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s', 'ſ']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K', 'K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S', 'ſ']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('µ', &['Μ', 'μ']), + ('À', &['à']), + ('Á', &['á']), + ('Â', &['â']), + ('Ã', &['ã']), + ('Ä', &['ä']), + ('Å', &['å', 'Å']), + ('Æ', &['æ']), + ('Ç', &['ç']), + ('È', &['è']), + ('É', &['é']), + ('Ê', &['ê']), + ('Ë', &['ë']), + ('Ì', &['ì']), + ('Í', &['í']), + ('Î', &['î']), + ('Ï', &['ï']), + ('Ð', &['ð']), + ('Ñ', &['ñ']), + ('Ò', &['ò']), + ('Ó', &['ó']), + ('Ô', &['ô']), + ('Õ', &['õ']), + ('Ö', &['ö']), + ('Ø', &['ø']), + ('Ù', &['ù']), + ('Ú', &['ú']), + ('Û', &['û']), + ('Ü', &['ü']), + ('Ý', &['ý']), + ('Þ', &['þ']), + ('ß', &['ẞ']), + ('à', &['À']), + ('á', &['Á']), + ('â', &['Â']), + ('ã', &['Ã']), + ('ä', &['Ä']), + ('å', &['Å', 'Å']), + ('æ', &['Æ']), + ('ç', &['Ç']), + ('è', &['È']), + ('é', &['É']), + ('ê', &['Ê']), + ('ë', &['Ë']), + ('ì', &['Ì']), + ('í', &['Í']), + ('î', &['Î']), + ('ï', &['Ï']), + ('ð', &['Ð']), + ('ñ', &['Ñ']), + ('ò', &['Ò']), + ('ó', &['Ó']), + ('ô', &['Ô']), + ('õ', &['Õ']), + ('ö', &['Ö']), + ('ø', &['Ø']), + ('ù', &['Ù']), + ('ú', &['Ú']), + ('û', &['Û']), + ('ü', &['Ü']), + 
('ý', &['Ý']), + ('þ', &['Þ']), + ('ÿ', &['Ÿ']), + ('Ā', &['ā']), + ('ā', &['Ā']), + ('Ă', &['ă']), + ('ă', &['Ă']), + ('Ą', &['ą']), + ('ą', &['Ą']), + ('Ć', &['ć']), + ('ć', &['Ć']), + ('Ĉ', &['ĉ']), + ('ĉ', &['Ĉ']), + ('Ċ', &['ċ']), + ('ċ', &['Ċ']), + ('Č', &['č']), + ('č', &['Č']), + ('Ď', &['ď']), + ('ď', &['Ď']), + ('Đ', &['đ']), + ('đ', &['Đ']), + ('Ē', &['ē']), + ('ē', &['Ē']), + ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), + ('Ė', &['ė']), + ('ė', &['Ė']), + ('Ę', &['ę']), + ('ę', &['Ę']), + ('Ě', &['ě']), + ('ě', &['Ě']), + ('Ĝ', &['ĝ']), + ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), + ('ğ', &['Ğ']), + ('Ġ', &['ġ']), + ('ġ', &['Ġ']), + ('Ģ', &['ģ']), + ('ģ', &['Ģ']), + ('Ĥ', &['ĥ']), + ('ĥ', &['Ĥ']), + ('Ħ', &['ħ']), + ('ħ', &['Ħ']), + ('Ĩ', &['ĩ']), + ('ĩ', &['Ĩ']), + ('Ī', &['ī']), + ('ī', &['Ī']), + ('Ĭ', &['ĭ']), + ('ĭ', &['Ĭ']), + ('Į', &['į']), + ('į', &['Į']), + ('IJ', &['ij']), + ('ij', &['IJ']), + ('Ĵ', &['ĵ']), + ('ĵ', &['Ĵ']), + ('Ķ', &['ķ']), + ('ķ', &['Ķ']), + ('Ĺ', &['ĺ']), + ('ĺ', &['Ĺ']), + ('Ļ', &['ļ']), + ('ļ', &['Ļ']), + ('Ľ', &['ľ']), + ('ľ', &['Ľ']), + ('Ŀ', &['ŀ']), + ('ŀ', &['Ŀ']), + ('Ł', &['ł']), + ('ł', &['Ł']), + ('Ń', &['ń']), + ('ń', &['Ń']), + ('Ņ', &['ņ']), + ('ņ', &['Ņ']), + ('Ň', &['ň']), + ('ň', &['Ň']), + ('Ŋ', &['ŋ']), + ('ŋ', &['Ŋ']), + ('Ō', &['ō']), + ('ō', &['Ō']), + ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), + ('Ő', &['ő']), + ('ő', &['Ő']), + ('Œ', &['œ']), + ('œ', &['Œ']), + ('Ŕ', &['ŕ']), + ('ŕ', &['Ŕ']), + ('Ŗ', &['ŗ']), + ('ŗ', &['Ŗ']), + ('Ř', &['ř']), + ('ř', &['Ř']), + ('Ś', &['ś']), + ('ś', &['Ś']), + ('Ŝ', &['ŝ']), + ('ŝ', &['Ŝ']), + ('Ş', &['ş']), + ('ş', &['Ş']), + ('Š', &['š']), + ('š', &['Š']), + ('Ţ', &['ţ']), + ('ţ', &['Ţ']), + ('Ť', &['ť']), + ('ť', &['Ť']), + ('Ŧ', &['ŧ']), + ('ŧ', &['Ŧ']), + ('Ũ', &['ũ']), + ('ũ', &['Ũ']), + ('Ū', &['ū']), + ('ū', &['Ū']), + ('Ŭ', &['ŭ']), + ('ŭ', &['Ŭ']), + ('Ů', &['ů']), + ('ů', &['Ů']), + ('Ű', &['ű']), + ('ű', &['Ű']), + ('Ų', &['ų']), + ('ų', &['Ų']), + ('Ŵ', &['ŵ']), + ('ŵ', &['Ŵ']), + ('Ŷ', 
&['ŷ']), + ('ŷ', &['Ŷ']), + ('Ÿ', &['ÿ']), + ('Ź', &['ź']), + ('ź', &['Ź']), + ('Ż', &['ż']), + ('ż', &['Ż']), + ('Ž', &['ž']), + ('ž', &['Ž']), + ('ſ', &['S', 's']), + ('ƀ', &['Ƀ']), + ('Ɓ', &['ɓ']), + ('Ƃ', &['ƃ']), + ('ƃ', &['Ƃ']), + ('Ƅ', &['ƅ']), + ('ƅ', &['Ƅ']), + ('Ɔ', &['ɔ']), + ('Ƈ', &['ƈ']), + ('ƈ', &['Ƈ']), + ('Ɖ', &['ɖ']), + ('Ɗ', &['ɗ']), + ('Ƌ', &['ƌ']), + ('ƌ', &['Ƌ']), + ('Ǝ', &['ǝ']), + ('Ə', &['ə']), + ('Ɛ', &['ɛ']), + ('Ƒ', &['ƒ']), + ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), + ('Ɣ', &['ɣ']), + ('ƕ', &['Ƕ']), + ('Ɩ', &['ɩ']), + ('Ɨ', &['ɨ']), + ('Ƙ', &['ƙ']), + ('ƙ', &['Ƙ']), + ('ƚ', &['Ƚ']), + ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), + ('ƞ', &['Ƞ']), + ('Ɵ', &['ɵ']), + ('Ơ', &['ơ']), + ('ơ', &['Ơ']), + ('Ƣ', &['ƣ']), + ('ƣ', &['Ƣ']), + ('Ƥ', &['ƥ']), + ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), + ('Ƨ', &['ƨ']), + ('ƨ', &['Ƨ']), + ('Ʃ', &['ʃ']), + ('Ƭ', &['ƭ']), + ('ƭ', &['Ƭ']), + ('Ʈ', &['ʈ']), + ('Ư', &['ư']), + ('ư', &['Ư']), + ('Ʊ', &['ʊ']), + ('Ʋ', &['ʋ']), + ('Ƴ', &['ƴ']), + ('ƴ', &['Ƴ']), + ('Ƶ', &['ƶ']), + ('ƶ', &['Ƶ']), + ('Ʒ', &['ʒ']), + ('Ƹ', &['ƹ']), + ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), + ('ƽ', &['Ƽ']), + ('ƿ', &['Ƿ']), + ('DŽ', &['Dž', 'dž']), + ('Dž', &['DŽ', 'dž']), + ('dž', &['DŽ', 'Dž']), + ('LJ', &['Lj', 'lj']), + ('Lj', &['LJ', 'lj']), + ('lj', &['LJ', 'Lj']), + ('NJ', &['Nj', 'nj']), + ('Nj', &['NJ', 'nj']), + ('nj', &['NJ', 'Nj']), + ('Ǎ', &['ǎ']), + ('ǎ', &['Ǎ']), + ('Ǐ', &['ǐ']), + ('ǐ', &['Ǐ']), + ('Ǒ', &['ǒ']), + ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), + ('ǔ', &['Ǔ']), + ('Ǖ', &['ǖ']), + ('ǖ', &['Ǖ']), + ('Ǘ', &['ǘ']), + ('ǘ', &['Ǘ']), + ('Ǚ', &['ǚ']), + ('ǚ', &['Ǚ']), + ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), + ('ǝ', &['Ǝ']), + ('Ǟ', &['ǟ']), + ('ǟ', &['Ǟ']), + ('Ǡ', &['ǡ']), + ('ǡ', &['Ǡ']), + ('Ǣ', &['ǣ']), + ('ǣ', &['Ǣ']), + ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), + ('Ǧ', &['ǧ']), + ('ǧ', &['Ǧ']), + ('Ǩ', &['ǩ']), + ('ǩ', &['Ǩ']), + ('Ǫ', &['ǫ']), + ('ǫ', &['Ǫ']), + ('Ǭ', &['ǭ']), + ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), + ('ǯ', &['Ǯ']), + ('DZ', &['Dz', 'dz']), + ('Dz', &['DZ', 
'dz']), + ('dz', &['DZ', 'Dz']), + ('Ǵ', &['ǵ']), + ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), + ('Ƿ', &['ƿ']), + ('Ǹ', &['ǹ']), + ('ǹ', &['Ǹ']), + ('Ǻ', &['ǻ']), + ('ǻ', &['Ǻ']), + ('Ǽ', &['ǽ']), + ('ǽ', &['Ǽ']), + ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), + ('Ȁ', &['ȁ']), + ('ȁ', &['Ȁ']), + ('Ȃ', &['ȃ']), + ('ȃ', &['Ȃ']), + ('Ȅ', &['ȅ']), + ('ȅ', &['Ȅ']), + ('Ȇ', &['ȇ']), + ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), + ('ȉ', &['Ȉ']), + ('Ȋ', &['ȋ']), + ('ȋ', &['Ȋ']), + ('Ȍ', &['ȍ']), + ('ȍ', &['Ȍ']), + ('Ȏ', &['ȏ']), + ('ȏ', &['Ȏ']), + ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), + ('Ȓ', &['ȓ']), + ('ȓ', &['Ȓ']), + ('Ȕ', &['ȕ']), + ('ȕ', &['Ȕ']), + ('Ȗ', &['ȗ']), + ('ȗ', &['Ȗ']), + ('Ș', &['ș']), + ('ș', &['Ș']), + ('Ț', &['ț']), + ('ț', &['Ț']), + ('Ȝ', &['ȝ']), + ('ȝ', &['Ȝ']), + ('Ȟ', &['ȟ']), + ('ȟ', &['Ȟ']), + ('Ƞ', &['ƞ']), + ('Ȣ', &['ȣ']), + ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), + ('ȥ', &['Ȥ']), + ('Ȧ', &['ȧ']), + ('ȧ', &['Ȧ']), + ('Ȩ', &['ȩ']), + ('ȩ', &['Ȩ']), + ('Ȫ', &['ȫ']), + ('ȫ', &['Ȫ']), + ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), + ('Ȯ', &['ȯ']), + ('ȯ', &['Ȯ']), + ('Ȱ', &['ȱ']), + ('ȱ', &['Ȱ']), + ('Ȳ', &['ȳ']), + ('ȳ', &['Ȳ']), + ('Ⱥ', &['ⱥ']), + ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), + ('Ƚ', &['ƚ']), + ('Ⱦ', &['ⱦ']), + ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), + ('Ɂ', &['ɂ']), + ('ɂ', &['Ɂ']), + ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), + ('Ʌ', &['ʌ']), + ('Ɇ', &['ɇ']), + ('ɇ', &['Ɇ']), + ('Ɉ', &['ɉ']), + ('ɉ', &['Ɉ']), + ('Ɋ', &['ɋ']), + ('ɋ', &['Ɋ']), + ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), + ('Ɏ', &['ɏ']), + ('ɏ', &['Ɏ']), + ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), + ('ɒ', &['Ɒ']), + ('ɓ', &['Ɓ']), + ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), + ('ɗ', &['Ɗ']), + ('ə', &['Ə']), + ('ɛ', &['Ɛ']), + ('ɜ', &['Ɜ']), + ('ɠ', &['Ɠ']), + ('ɡ', &['Ɡ']), + ('ɣ', &['Ɣ']), + ('ɥ', &['Ɥ']), + ('ɦ', &['Ɦ']), + ('ɨ', &['Ɨ']), + ('ɩ', &['Ɩ']), + ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), + ('ɬ', &['Ɬ']), + ('ɯ', &['Ɯ']), + ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), + ('ɵ', &['Ɵ']), + ('ɽ', &['Ɽ']), + ('ʀ', &['Ʀ']), + ('ʃ', &['Ʃ']), + ('ʇ', &['Ʇ']), + ('ʈ', &['Ʈ']), + ('ʉ', &['Ʉ']), + ('ʊ', 
&['Ʊ']), + ('ʋ', &['Ʋ']), + ('ʌ', &['Ʌ']), + ('ʒ', &['Ʒ']), + ('ʝ', &['Ʝ']), + ('ʞ', &['Ʞ']), + ('ͅ', &['Ι', 'ι', 'ι']), + ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), + ('Ͳ', &['ͳ']), + ('ͳ', &['Ͳ']), + ('Ͷ', &['ͷ']), + ('ͷ', &['Ͷ']), + ('ͻ', &['Ͻ']), + ('ͼ', &['Ͼ']), + ('ͽ', &['Ͽ']), + ('Ϳ', &['ϳ']), + ('Ά', &['ά']), + ('Έ', &['έ']), + ('Ή', &['ή']), + ('Ί', &['ί']), + ('Ό', &['ό']), + ('Ύ', &['ύ']), + ('Ώ', &['ώ']), + ('Α', &['α']), + ('Β', &['β', 'ϐ']), + ('Γ', &['γ']), + ('Δ', &['δ']), + ('Ε', &['ε', 'ϵ']), + ('Ζ', &['ζ']), + ('Η', &['η']), + ('Θ', &['θ', 'ϑ', 'ϴ']), + ('Ι', &['ͅ', 'ι', 'ι']), + ('Κ', &['κ', 'ϰ']), + ('Λ', &['λ']), + ('Μ', &['µ', 'μ']), + ('Ν', &['ν']), + ('Ξ', &['ξ']), + ('Ο', &['ο']), + ('Π', &['π', 'ϖ']), + ('Ρ', &['ρ', 'ϱ']), + ('Σ', &['ς', 'σ']), + ('Τ', &['τ']), + ('Υ', &['υ']), + ('Φ', &['φ', 'ϕ']), + ('Χ', &['χ']), + ('Ψ', &['ψ']), + ('Ω', &['ω', 'Ω']), + ('Ϊ', &['ϊ']), + ('Ϋ', &['ϋ']), + ('ά', &['Ά']), + ('έ', &['Έ']), + ('ή', &['Ή']), + ('ί', &['Ί']), + ('α', &['Α']), + ('β', &['Β', 'ϐ']), + ('γ', &['Γ']), + ('δ', &['Δ']), + ('ε', &['Ε', 'ϵ']), + ('ζ', &['Ζ']), + ('η', &['Η']), + ('θ', &['Θ', 'ϑ', 'ϴ']), + ('ι', &['ͅ', 'Ι', 'ι']), + ('κ', &['Κ', 'ϰ']), + ('λ', &['Λ']), + ('μ', &['µ', 'Μ']), + ('ν', &['Ν']), + ('ξ', &['Ξ']), + ('ο', &['Ο']), + ('π', &['Π', 'ϖ']), + ('ρ', &['Ρ', 'ϱ']), + ('ς', &['Σ', 'σ']), + ('σ', &['Σ', 'ς']), + ('τ', &['Τ']), + ('υ', &['Υ']), + ('φ', &['Φ', 'ϕ']), + ('χ', &['Χ']), + ('ψ', &['Ψ']), + ('ω', &['Ω', 'Ω']), + ('ϊ', &['Ϊ']), + ('ϋ', &['Ϋ']), + ('ό', &['Ό']), + ('ύ', &['Ύ']), + ('ώ', &['Ώ']), + ('Ϗ', &['ϗ']), + ('ϐ', &['Β', 'β']), + ('ϑ', &['Θ', 'θ', 'ϴ']), + ('ϕ', &['Φ', 'φ']), + ('ϖ', &['Π', 'π']), + ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), + ('ϙ', &['Ϙ']), + ('Ϛ', &['ϛ']), + ('ϛ', &['Ϛ']), + ('Ϝ', &['ϝ']), + ('ϝ', &['Ϝ']), + ('Ϟ', &['ϟ']), + ('ϟ', &['Ϟ']), + ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), + ('Ϣ', &['ϣ']), + ('ϣ', &['Ϣ']), + ('Ϥ', &['ϥ']), + ('ϥ', &['Ϥ']), + ('Ϧ', &['ϧ']), + ('ϧ', &['Ϧ']), + ('Ϩ', &['ϩ']), + ('ϩ', &['Ϩ']), + 
('Ϫ', &['ϫ']), + ('ϫ', &['Ϫ']), + ('Ϭ', &['ϭ']), + ('ϭ', &['Ϭ']), + ('Ϯ', &['ϯ']), + ('ϯ', &['Ϯ']), + ('ϰ', &['Κ', 'κ']), + ('ϱ', &['Ρ', 'ρ']), + ('ϲ', &['Ϲ']), + ('ϳ', &['Ϳ']), + ('ϴ', &['Θ', 'θ', 'ϑ']), + ('ϵ', &['Ε', 'ε']), + ('Ϸ', &['ϸ']), + ('ϸ', &['Ϸ']), + ('Ϲ', &['ϲ']), + ('Ϻ', &['ϻ']), + ('ϻ', &['Ϻ']), + ('Ͻ', &['ͻ']), + ('Ͼ', &['ͼ']), + ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), + ('Ё', &['ё']), + ('Ђ', &['ђ']), + ('Ѓ', &['ѓ']), + ('Є', &['є']), + ('Ѕ', &['ѕ']), + ('І', &['і']), + ('Ї', &['ї']), + ('Ј', &['ј']), + ('Љ', &['љ']), + ('Њ', &['њ']), + ('Ћ', &['ћ']), + ('Ќ', &['ќ']), + ('Ѝ', &['ѝ']), + ('Ў', &['ў']), + ('Џ', &['џ']), + ('А', &['а']), + ('Б', &['б']), + ('В', &['в', 'ᲀ']), + ('Г', &['г']), + ('Д', &['д', 'ᲁ']), + ('Е', &['е']), + ('Ж', &['ж']), + ('З', &['з']), + ('И', &['и']), + ('Й', &['й']), + ('К', &['к']), + ('Л', &['л']), + ('М', &['м']), + ('Н', &['н']), + ('О', &['о', 'ᲂ']), + ('П', &['п']), + ('Р', &['р']), + ('С', &['с', 'ᲃ']), + ('Т', &['т', 'ᲄ', 'ᲅ']), + ('У', &['у']), + ('Ф', &['ф']), + ('Х', &['х']), + ('Ц', &['ц']), + ('Ч', &['ч']), + ('Ш', &['ш']), + ('Щ', &['щ']), + ('Ъ', &['ъ', 'ᲆ']), + ('Ы', &['ы']), + ('Ь', &['ь']), + ('Э', &['э']), + ('Ю', &['ю']), + ('Я', &['я']), + ('а', &['А']), + ('б', &['Б']), + ('в', &['В', 'ᲀ']), + ('г', &['Г']), + ('д', &['Д', 'ᲁ']), + ('е', &['Е']), + ('ж', &['Ж']), + ('з', &['З']), + ('и', &['И']), + ('й', &['Й']), + ('к', &['К']), + ('л', &['Л']), + ('м', &['М']), + ('н', &['Н']), + ('о', &['О', 'ᲂ']), + ('п', &['П']), + ('р', &['Р']), + ('с', &['С', 'ᲃ']), + ('т', &['Т', 'ᲄ', 'ᲅ']), + ('у', &['У']), + ('ф', &['Ф']), + ('х', &['Х']), + ('ц', &['Ц']), + ('ч', &['Ч']), + ('ш', &['Ш']), + ('щ', &['Щ']), + ('ъ', &['Ъ', 'ᲆ']), + ('ы', &['Ы']), + ('ь', &['Ь']), + ('э', &['Э']), + ('ю', &['Ю']), + ('я', &['Я']), + ('ѐ', &['Ѐ']), + ('ё', &['Ё']), + ('ђ', &['Ђ']), + ('ѓ', &['Ѓ']), + ('є', &['Є']), + ('ѕ', &['Ѕ']), + ('і', &['І']), + ('ї', &['Ї']), + ('ј', &['Ј']), + ('љ', &['Љ']), + ('њ', &['Њ']), + ('ћ', &['Ћ']), + 
('ќ', &['Ќ']), + ('ѝ', &['Ѝ']), + ('ў', &['Ў']), + ('џ', &['Џ']), + ('Ѡ', &['ѡ']), + ('ѡ', &['Ѡ']), + ('Ѣ', &['ѣ', 'ᲇ']), + ('ѣ', &['Ѣ', 'ᲇ']), + ('Ѥ', &['ѥ']), + ('ѥ', &['Ѥ']), + ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), + ('Ѩ', &['ѩ']), + ('ѩ', &['Ѩ']), + ('Ѫ', &['ѫ']), + ('ѫ', &['Ѫ']), + ('Ѭ', &['ѭ']), + ('ѭ', &['Ѭ']), + ('Ѯ', &['ѯ']), + ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), + ('ѱ', &['Ѱ']), + ('Ѳ', &['ѳ']), + ('ѳ', &['Ѳ']), + ('Ѵ', &['ѵ']), + ('ѵ', &['Ѵ']), + ('Ѷ', &['ѷ']), + ('ѷ', &['Ѷ']), + ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), + ('Ѻ', &['ѻ']), + ('ѻ', &['Ѻ']), + ('Ѽ', &['ѽ']), + ('ѽ', &['Ѽ']), + ('Ѿ', &['ѿ']), + ('ѿ', &['Ѿ']), + ('Ҁ', &['ҁ']), + ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), + ('ҋ', &['Ҋ']), + ('Ҍ', &['ҍ']), + ('ҍ', &['Ҍ']), + ('Ҏ', &['ҏ']), + ('ҏ', &['Ҏ']), + ('Ґ', &['ґ']), + ('ґ', &['Ґ']), + ('Ғ', &['ғ']), + ('ғ', &['Ғ']), + ('Ҕ', &['ҕ']), + ('ҕ', &['Ҕ']), + ('Җ', &['җ']), + ('җ', &['Җ']), + ('Ҙ', &['ҙ']), + ('ҙ', &['Ҙ']), + ('Қ', &['қ']), + ('қ', &['Қ']), + ('Ҝ', &['ҝ']), + ('ҝ', &['Ҝ']), + ('Ҟ', &['ҟ']), + ('ҟ', &['Ҟ']), + ('Ҡ', &['ҡ']), + ('ҡ', &['Ҡ']), + ('Ң', &['ң']), + ('ң', &['Ң']), + ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), + ('Ҧ', &['ҧ']), + ('ҧ', &['Ҧ']), + ('Ҩ', &['ҩ']), + ('ҩ', &['Ҩ']), + ('Ҫ', &['ҫ']), + ('ҫ', &['Ҫ']), + ('Ҭ', &['ҭ']), + ('ҭ', &['Ҭ']), + ('Ү', &['ү']), + ('ү', &['Ү']), + ('Ұ', &['ұ']), + ('ұ', &['Ұ']), + ('Ҳ', &['ҳ']), + ('ҳ', &['Ҳ']), + ('Ҵ', &['ҵ']), + ('ҵ', &['Ҵ']), + ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), + ('Ҹ', &['ҹ']), + ('ҹ', &['Ҹ']), + ('Һ', &['һ']), + ('һ', &['Һ']), + ('Ҽ', &['ҽ']), + ('ҽ', &['Ҽ']), + ('Ҿ', &['ҿ']), + ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), + ('Ӂ', &['ӂ']), + ('ӂ', &['Ӂ']), + ('Ӄ', &['ӄ']), + ('ӄ', &['Ӄ']), + ('Ӆ', &['ӆ']), + ('ӆ', &['Ӆ']), + ('Ӈ', &['ӈ']), + ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), + ('ӊ', &['Ӊ']), + ('Ӌ', &['ӌ']), + ('ӌ', &['Ӌ']), + ('Ӎ', &['ӎ']), + ('ӎ', &['Ӎ']), + ('ӏ', &['Ӏ']), + ('Ӑ', &['ӑ']), + ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), + ('ӓ', &['Ӓ']), + ('Ӕ', &['ӕ']), + ('ӕ', &['Ӕ']), + ('Ӗ', &['ӗ']), + ('ӗ', &['Ӗ']), + ('Ә', &['ә']), + 
('ә', &['Ә']), + ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), + ('Ӝ', &['ӝ']), + ('ӝ', &['Ӝ']), + ('Ӟ', &['ӟ']), + ('ӟ', &['Ӟ']), + ('Ӡ', &['ӡ']), + ('ӡ', &['Ӡ']), + ('Ӣ', &['ӣ']), + ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), + ('ӥ', &['Ӥ']), + ('Ӧ', &['ӧ']), + ('ӧ', &['Ӧ']), + ('Ө', &['ө']), + ('ө', &['Ө']), + ('Ӫ', &['ӫ']), + ('ӫ', &['Ӫ']), + ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), + ('Ӯ', &['ӯ']), + ('ӯ', &['Ӯ']), + ('Ӱ', &['ӱ']), + ('ӱ', &['Ӱ']), + ('Ӳ', &['ӳ']), + ('ӳ', &['Ӳ']), + ('Ӵ', &['ӵ']), + ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), + ('ӷ', &['Ӷ']), + ('Ӹ', &['ӹ']), + ('ӹ', &['Ӹ']), + ('Ӻ', &['ӻ']), + ('ӻ', &['Ӻ']), + ('Ӽ', &['ӽ']), + ('ӽ', &['Ӽ']), + ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), + ('Ԁ', &['ԁ']), + ('ԁ', &['Ԁ']), + ('Ԃ', &['ԃ']), + ('ԃ', &['Ԃ']), + ('Ԅ', &['ԅ']), + ('ԅ', &['Ԅ']), + ('Ԇ', &['ԇ']), + ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), + ('ԉ', &['Ԉ']), + ('Ԋ', &['ԋ']), + ('ԋ', &['Ԋ']), + ('Ԍ', &['ԍ']), + ('ԍ', &['Ԍ']), + ('Ԏ', &['ԏ']), + ('ԏ', &['Ԏ']), + ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), + ('Ԓ', &['ԓ']), + ('ԓ', &['Ԓ']), + ('Ԕ', &['ԕ']), + ('ԕ', &['Ԕ']), + ('Ԗ', &['ԗ']), + ('ԗ', &['Ԗ']), + ('Ԙ', &['ԙ']), + ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), + ('ԛ', &['Ԛ']), + ('Ԝ', &['ԝ']), + ('ԝ', &['Ԝ']), + ('Ԟ', &['ԟ']), + ('ԟ', &['Ԟ']), + ('Ԡ', &['ԡ']), + ('ԡ', &['Ԡ']), + ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), + ('Ԥ', &['ԥ']), + ('ԥ', &['Ԥ']), + ('Ԧ', &['ԧ']), + ('ԧ', &['Ԧ']), + ('Ԩ', &['ԩ']), + ('ԩ', &['Ԩ']), + ('Ԫ', &['ԫ']), + ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), + ('ԭ', &['Ԭ']), + ('Ԯ', &['ԯ']), + ('ԯ', &['Ԯ']), + ('Ա', &['ա']), + ('Բ', &['բ']), + ('Գ', &['գ']), + ('Դ', &['դ']), + ('Ե', &['ե']), + ('Զ', &['զ']), + ('Է', &['է']), + ('Ը', &['ը']), + ('Թ', &['թ']), + ('Ժ', &['ժ']), + ('Ի', &['ի']), + ('Լ', &['լ']), + ('Խ', &['խ']), + ('Ծ', &['ծ']), + ('Կ', &['կ']), + ('Հ', &['հ']), + ('Ձ', &['ձ']), + ('Ղ', &['ղ']), + ('Ճ', &['ճ']), + ('Մ', &['մ']), + ('Յ', &['յ']), + ('Ն', &['ն']), + ('Շ', &['շ']), + ('Ո', &['ո']), + ('Չ', &['չ']), + ('Պ', &['պ']), + ('Ջ', &['ջ']), + ('Ռ', &['ռ']), + ('Ս', &['ս']), + ('Վ', &['վ']), + ('Տ', 
&['տ']), + ('Ր', &['ր']), + ('Ց', &['ց']), + ('Ւ', &['ւ']), + ('Փ', &['փ']), + ('Ք', &['ք']), + ('Օ', &['օ']), + ('Ֆ', &['ֆ']), + ('ա', &['Ա']), + ('բ', &['Բ']), + ('գ', &['Գ']), + ('դ', &['Դ']), + ('ե', &['Ե']), + ('զ', &['Զ']), + ('է', &['Է']), + ('ը', &['Ը']), + ('թ', &['Թ']), + ('ժ', &['Ժ']), + ('ի', &['Ի']), + ('լ', &['Լ']), + ('խ', &['Խ']), + ('ծ', &['Ծ']), + ('կ', &['Կ']), + ('հ', &['Հ']), + ('ձ', &['Ձ']), + ('ղ', &['Ղ']), + ('ճ', &['Ճ']), + ('մ', &['Մ']), + ('յ', &['Յ']), + ('ն', &['Ն']), + ('շ', &['Շ']), + ('ո', &['Ո']), + ('չ', &['Չ']), + ('պ', &['Պ']), + ('ջ', &['Ջ']), + ('ռ', &['Ռ']), + ('ս', &['Ս']), + ('վ', &['Վ']), + ('տ', &['Տ']), + ('ր', &['Ր']), + ('ց', &['Ց']), + ('ւ', &['Ւ']), + ('փ', &['Փ']), + ('ք', &['Ք']), + ('օ', &['Օ']), + ('ֆ', &['Ֆ']), + ('Ⴀ', &['ⴀ']), + ('Ⴁ', &['ⴁ']), + ('Ⴂ', &['ⴂ']), + ('Ⴃ', &['ⴃ']), + ('Ⴄ', &['ⴄ']), + ('Ⴅ', &['ⴅ']), + ('Ⴆ', &['ⴆ']), + ('Ⴇ', &['ⴇ']), + ('Ⴈ', &['ⴈ']), + ('Ⴉ', &['ⴉ']), + ('Ⴊ', &['ⴊ']), + ('Ⴋ', &['ⴋ']), + ('Ⴌ', &['ⴌ']), + ('Ⴍ', &['ⴍ']), + ('Ⴎ', &['ⴎ']), + ('Ⴏ', &['ⴏ']), + ('Ⴐ', &['ⴐ']), + ('Ⴑ', &['ⴑ']), + ('Ⴒ', &['ⴒ']), + ('Ⴓ', &['ⴓ']), + ('Ⴔ', &['ⴔ']), + ('Ⴕ', &['ⴕ']), + ('Ⴖ', &['ⴖ']), + ('Ⴗ', &['ⴗ']), + ('Ⴘ', &['ⴘ']), + ('Ⴙ', &['ⴙ']), + ('Ⴚ', &['ⴚ']), + ('Ⴛ', &['ⴛ']), + ('Ⴜ', &['ⴜ']), + ('Ⴝ', &['ⴝ']), + ('Ⴞ', &['ⴞ']), + ('Ⴟ', &['ⴟ']), + ('Ⴠ', &['ⴠ']), + ('Ⴡ', &['ⴡ']), + ('Ⴢ', &['ⴢ']), + ('Ⴣ', &['ⴣ']), + ('Ⴤ', &['ⴤ']), + ('Ⴥ', &['ⴥ']), + ('Ⴧ', &['ⴧ']), + ('Ⴭ', &['ⴭ']), + ('Ꭰ', &['ꭰ']), + ('Ꭱ', &['ꭱ']), + ('Ꭲ', &['ꭲ']), + ('Ꭳ', &['ꭳ']), + ('Ꭴ', &['ꭴ']), + ('Ꭵ', &['ꭵ']), + ('Ꭶ', &['ꭶ']), + ('Ꭷ', &['ꭷ']), + ('Ꭸ', &['ꭸ']), + ('Ꭹ', &['ꭹ']), + ('Ꭺ', &['ꭺ']), + ('Ꭻ', &['ꭻ']), + ('Ꭼ', &['ꭼ']), + ('Ꭽ', &['ꭽ']), + ('Ꭾ', &['ꭾ']), + ('Ꭿ', &['ꭿ']), + ('Ꮀ', &['ꮀ']), + ('Ꮁ', &['ꮁ']), + ('Ꮂ', &['ꮂ']), + ('Ꮃ', &['ꮃ']), + ('Ꮄ', &['ꮄ']), + ('Ꮅ', &['ꮅ']), + ('Ꮆ', &['ꮆ']), + ('Ꮇ', &['ꮇ']), + ('Ꮈ', &['ꮈ']), + ('Ꮉ', &['ꮉ']), + ('Ꮊ', &['ꮊ']), + ('Ꮋ', &['ꮋ']), + ('Ꮌ', &['ꮌ']), + ('Ꮍ', &['ꮍ']), + ('Ꮎ', &['ꮎ']), + ('Ꮏ', &['ꮏ']), + 
('Ꮐ', &['ꮐ']), + ('Ꮑ', &['ꮑ']), + ('Ꮒ', &['ꮒ']), + ('Ꮓ', &['ꮓ']), + ('Ꮔ', &['ꮔ']), + ('Ꮕ', &['ꮕ']), + ('Ꮖ', &['ꮖ']), + ('Ꮗ', &['ꮗ']), + ('Ꮘ', &['ꮘ']), + ('Ꮙ', &['ꮙ']), + ('Ꮚ', &['ꮚ']), + ('Ꮛ', &['ꮛ']), + ('Ꮜ', &['ꮜ']), + ('Ꮝ', &['ꮝ']), + ('Ꮞ', &['ꮞ']), + ('Ꮟ', &['ꮟ']), + ('Ꮠ', &['ꮠ']), + ('Ꮡ', &['ꮡ']), + ('Ꮢ', &['ꮢ']), + ('Ꮣ', &['ꮣ']), + ('Ꮤ', &['ꮤ']), + ('Ꮥ', &['ꮥ']), + ('Ꮦ', &['ꮦ']), + ('Ꮧ', &['ꮧ']), + ('Ꮨ', &['ꮨ']), + ('Ꮩ', &['ꮩ']), + ('Ꮪ', &['ꮪ']), + ('Ꮫ', &['ꮫ']), + ('Ꮬ', &['ꮬ']), + ('Ꮭ', &['ꮭ']), + ('Ꮮ', &['ꮮ']), + ('Ꮯ', &['ꮯ']), + ('Ꮰ', &['ꮰ']), + ('Ꮱ', &['ꮱ']), + ('Ꮲ', &['ꮲ']), + ('Ꮳ', &['ꮳ']), + ('Ꮴ', &['ꮴ']), + ('Ꮵ', &['ꮵ']), + ('Ꮶ', &['ꮶ']), + ('Ꮷ', &['ꮷ']), + ('Ꮸ', &['ꮸ']), + ('Ꮹ', &['ꮹ']), + ('Ꮺ', &['ꮺ']), + ('Ꮻ', &['ꮻ']), + ('Ꮼ', &['ꮼ']), + ('Ꮽ', &['ꮽ']), + ('Ꮾ', &['ꮾ']), + ('Ꮿ', &['ꮿ']), + ('Ᏸ', &['ᏸ']), + ('Ᏹ', &['ᏹ']), + ('Ᏺ', &['ᏺ']), + ('Ᏻ', &['ᏻ']), + ('Ᏼ', &['ᏼ']), + ('Ᏽ', &['ᏽ']), + ('ᏸ', &['Ᏸ']), + ('ᏹ', &['Ᏹ']), + ('ᏺ', &['Ᏺ']), + ('ᏻ', &['Ᏻ']), + ('ᏼ', &['Ᏼ']), + ('ᏽ', &['Ᏽ']), + ('ᲀ', &['В', 'в']), + ('ᲁ', &['Д', 'д']), + ('ᲂ', &['О', 'о']), + ('ᲃ', &['С', 'с']), + ('ᲄ', &['Т', 'т', 'ᲅ']), + ('ᲅ', &['Т', 'т', 'ᲄ']), + ('ᲆ', &['Ъ', 'ъ']), + ('ᲇ', &['Ѣ', 'ѣ']), + ('ᲈ', &['Ꙋ', 'ꙋ']), + ('ᵹ', &['Ᵹ']), + ('ᵽ', &['Ᵽ']), + ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), + ('Ḃ', &['ḃ']), + ('ḃ', &['Ḃ']), + ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), + ('Ḇ', &['ḇ']), + ('ḇ', &['Ḇ']), + ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), + ('Ḋ', &['ḋ']), + ('ḋ', &['Ḋ']), + ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), + ('Ḏ', &['ḏ']), + ('ḏ', &['Ḏ']), + ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), + ('Ḓ', &['ḓ']), + ('ḓ', &['Ḓ']), + ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), + ('Ḗ', &['ḗ']), + ('ḗ', &['Ḗ']), + ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), + ('Ḛ', &['ḛ']), + ('ḛ', &['Ḛ']), + ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), + ('Ḟ', &['ḟ']), + ('ḟ', &['Ḟ']), + ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), + ('Ḣ', &['ḣ']), + ('ḣ', &['Ḣ']), + ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), + ('Ḧ', &['ḧ']), + ('ḧ', &['Ḧ']), + ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), + ('Ḫ', &['ḫ']), + ('ḫ', 
&['Ḫ']), + ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), + ('Ḯ', &['ḯ']), + ('ḯ', &['Ḯ']), + ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), + ('Ḳ', &['ḳ']), + ('ḳ', &['Ḳ']), + ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), + ('Ḷ', &['ḷ']), + ('ḷ', &['Ḷ']), + ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), + ('Ḻ', &['ḻ']), + ('ḻ', &['Ḻ']), + ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), + ('Ḿ', &['ḿ']), + ('ḿ', &['Ḿ']), + ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), + ('Ṃ', &['ṃ']), + ('ṃ', &['Ṃ']), + ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), + ('Ṇ', &['ṇ']), + ('ṇ', &['Ṇ']), + ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), + ('Ṋ', &['ṋ']), + ('ṋ', &['Ṋ']), + ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), + ('Ṏ', &['ṏ']), + ('ṏ', &['Ṏ']), + ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), + ('Ṓ', &['ṓ']), + ('ṓ', &['Ṓ']), + ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), + ('Ṗ', &['ṗ']), + ('ṗ', &['Ṗ']), + ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), + ('Ṛ', &['ṛ']), + ('ṛ', &['Ṛ']), + ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), + ('Ṟ', &['ṟ']), + ('ṟ', &['Ṟ']), + ('Ṡ', &['ṡ', 'ẛ']), + ('ṡ', &['Ṡ', 'ẛ']), + ('Ṣ', &['ṣ']), + ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), + ('ṥ', &['Ṥ']), + ('Ṧ', &['ṧ']), + ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), + ('ṩ', &['Ṩ']), + ('Ṫ', &['ṫ']), + ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), + ('ṭ', &['Ṭ']), + ('Ṯ', &['ṯ']), + ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), + ('ṱ', &['Ṱ']), + ('Ṳ', &['ṳ']), + ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), + ('ṵ', &['Ṵ']), + ('Ṷ', &['ṷ']), + ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), + ('ṹ', &['Ṹ']), + ('Ṻ', &['ṻ']), + ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), + ('ṽ', &['Ṽ']), + ('Ṿ', &['ṿ']), + ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), + ('ẁ', &['Ẁ']), + ('Ẃ', &['ẃ']), + ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), + ('ẅ', &['Ẅ']), + ('Ẇ', &['ẇ']), + ('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), + ('ẉ', &['Ẉ']), + ('Ẋ', &['ẋ']), + ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), + ('ẍ', &['Ẍ']), + ('Ẏ', &['ẏ']), + ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), + ('ẑ', &['Ẑ']), + ('Ẓ', &['ẓ']), + ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), + ('ẕ', &['Ẕ']), + ('ẛ', &['Ṡ', 'ṡ']), + ('ẞ', &['ß']), + ('Ạ', &['ạ']), + ('ạ', &['Ạ']), + ('Ả', &['ả']), + ('ả', &['Ả']), + ('Ấ', &['ấ']), + ('ấ', &['Ấ']), + ('Ầ', &['ầ']), + ('ầ', &['Ầ']), + 
('Ẩ', &['ẩ']), + ('ẩ', &['Ẩ']), + ('Ẫ', &['ẫ']), + ('ẫ', &['Ẫ']), + ('Ậ', &['ậ']), + ('ậ', &['Ậ']), + ('Ắ', &['ắ']), + ('ắ', &['Ắ']), + ('Ằ', &['ằ']), + ('ằ', &['Ằ']), + ('Ẳ', &['ẳ']), + ('ẳ', &['Ẳ']), + ('Ẵ', &['ẵ']), + ('ẵ', &['Ẵ']), + ('Ặ', &['ặ']), + ('ặ', &['Ặ']), + ('Ẹ', &['ẹ']), + ('ẹ', &['Ẹ']), + ('Ẻ', &['ẻ']), + ('ẻ', &['Ẻ']), + ('Ẽ', &['ẽ']), + ('ẽ', &['Ẽ']), + ('Ế', &['ế']), + ('ế', &['Ế']), + ('Ề', &['ề']), + ('ề', &['Ề']), + ('Ể', &['ể']), + ('ể', &['Ể']), + ('Ễ', &['ễ']), + ('ễ', &['Ễ']), + ('Ệ', &['ệ']), + ('ệ', &['Ệ']), + ('Ỉ', &['ỉ']), + ('ỉ', &['Ỉ']), + ('Ị', &['ị']), + ('ị', &['Ị']), + ('Ọ', &['ọ']), + ('ọ', &['Ọ']), + ('Ỏ', &['ỏ']), + ('ỏ', &['Ỏ']), + ('Ố', &['ố']), + ('ố', &['Ố']), + ('Ồ', &['ồ']), + ('ồ', &['Ồ']), + ('Ổ', &['ổ']), + ('ổ', &['Ổ']), + ('Ỗ', &['ỗ']), + ('ỗ', &['Ỗ']), + ('Ộ', &['ộ']), + ('ộ', &['Ộ']), + ('Ớ', &['ớ']), + ('ớ', &['Ớ']), + ('Ờ', &['ờ']), + ('ờ', &['Ờ']), + ('Ở', &['ở']), + ('ở', &['Ở']), + ('Ỡ', &['ỡ']), + ('ỡ', &['Ỡ']), + ('Ợ', &['ợ']), + ('ợ', &['Ợ']), + ('Ụ', &['ụ']), + ('ụ', &['Ụ']), + ('Ủ', &['ủ']), + ('ủ', &['Ủ']), + ('Ứ', &['ứ']), + ('ứ', &['Ứ']), + ('Ừ', &['ừ']), + ('ừ', &['Ừ']), + ('Ử', &['ử']), + ('ử', &['Ử']), + ('Ữ', &['ữ']), + ('ữ', &['Ữ']), + ('Ự', &['ự']), + ('ự', &['Ự']), + ('Ỳ', &['ỳ']), + ('ỳ', &['Ỳ']), + ('Ỵ', &['ỵ']), + ('ỵ', &['Ỵ']), + ('Ỷ', &['ỷ']), + ('ỷ', &['Ỷ']), + ('Ỹ', &['ỹ']), + ('ỹ', &['Ỹ']), + ('Ỻ', &['ỻ']), + ('ỻ', &['Ỻ']), + ('Ỽ', &['ỽ']), + ('ỽ', &['Ỽ']), + ('Ỿ', &['ỿ']), + ('ỿ', &['Ỿ']), + ('ἀ', &['Ἀ']), + ('ἁ', &['Ἁ']), + ('ἂ', &['Ἂ']), + ('ἃ', &['Ἃ']), + ('ἄ', &['Ἄ']), + ('ἅ', &['Ἅ']), + ('ἆ', &['Ἆ']), + ('ἇ', &['Ἇ']), + ('Ἀ', &['ἀ']), + ('Ἁ', &['ἁ']), + ('Ἂ', &['ἂ']), + ('Ἃ', &['ἃ']), + ('Ἄ', &['ἄ']), + ('Ἅ', &['ἅ']), + ('Ἆ', &['ἆ']), + ('Ἇ', &['ἇ']), + ('ἐ', &['Ἐ']), + ('ἑ', &['Ἑ']), + ('ἒ', &['Ἒ']), + ('ἓ', &['Ἓ']), + ('ἔ', &['Ἔ']), + ('ἕ', &['Ἕ']), + ('Ἐ', &['ἐ']), + ('Ἑ', &['ἑ']), + ('Ἒ', &['ἒ']), + ('Ἓ', &['ἓ']), + ('Ἔ', &['ἔ']), + ('Ἕ', &['ἕ']), + ('ἠ', &['Ἠ']), + ('ἡ', 
&['Ἡ']), + ('ἢ', &['Ἢ']), + ('ἣ', &['Ἣ']), + ('ἤ', &['Ἤ']), + ('ἥ', &['Ἥ']), + ('ἦ', &['Ἦ']), + ('ἧ', &['Ἧ']), + ('Ἠ', &['ἠ']), + ('Ἡ', &['ἡ']), + ('Ἢ', &['ἢ']), + ('Ἣ', &['ἣ']), + ('Ἤ', &['ἤ']), + ('Ἥ', &['ἥ']), + ('Ἦ', &['ἦ']), + ('Ἧ', &['ἧ']), + ('ἰ', &['Ἰ']), + ('ἱ', &['Ἱ']), + ('ἲ', &['Ἲ']), + ('ἳ', &['Ἳ']), + ('ἴ', &['Ἴ']), + ('ἵ', &['Ἵ']), + ('ἶ', &['Ἶ']), + ('ἷ', &['Ἷ']), + ('Ἰ', &['ἰ']), + ('Ἱ', &['ἱ']), + ('Ἲ', &['ἲ']), + ('Ἳ', &['ἳ']), + ('Ἴ', &['ἴ']), + ('Ἵ', &['ἵ']), + ('Ἶ', &['ἶ']), + ('Ἷ', &['ἷ']), + ('ὀ', &['Ὀ']), + ('ὁ', &['Ὁ']), + ('ὂ', &['Ὂ']), + ('ὃ', &['Ὃ']), + ('ὄ', &['Ὄ']), + ('ὅ', &['Ὅ']), + ('Ὀ', &['ὀ']), + ('Ὁ', &['ὁ']), + ('Ὂ', &['ὂ']), + ('Ὃ', &['ὃ']), + ('Ὄ', &['ὄ']), + ('Ὅ', &['ὅ']), + ('ὑ', &['Ὑ']), + ('ὓ', &['Ὓ']), + ('ὕ', &['Ὕ']), + ('ὗ', &['Ὗ']), + ('Ὑ', &['ὑ']), + ('Ὓ', &['ὓ']), + ('Ὕ', &['ὕ']), + ('Ὗ', &['ὗ']), + ('ὠ', &['Ὠ']), + ('ὡ', &['Ὡ']), + ('ὢ', &['Ὢ']), + ('ὣ', &['Ὣ']), + ('ὤ', &['Ὤ']), + ('ὥ', &['Ὥ']), + ('ὦ', &['Ὦ']), + ('ὧ', &['Ὧ']), + ('Ὠ', &['ὠ']), + ('Ὡ', &['ὡ']), + ('Ὢ', &['ὢ']), + ('Ὣ', &['ὣ']), + ('Ὤ', &['ὤ']), + ('Ὥ', &['ὥ']), + ('Ὦ', &['ὦ']), + ('Ὧ', &['ὧ']), + ('ὰ', &['Ὰ']), + ('ά', &['Ά']), + ('ὲ', &['Ὲ']), + ('έ', &['Έ']), + ('ὴ', &['Ὴ']), + ('ή', &['Ή']), + ('ὶ', &['Ὶ']), + ('ί', &['Ί']), + ('ὸ', &['Ὸ']), + ('ό', &['Ό']), + ('ὺ', &['Ὺ']), + ('ύ', &['Ύ']), + ('ὼ', &['Ὼ']), + ('ώ', &['Ώ']), + ('ᾀ', &['ᾈ']), + ('ᾁ', &['ᾉ']), + ('ᾂ', &['ᾊ']), + ('ᾃ', &['ᾋ']), + ('ᾄ', &['ᾌ']), + ('ᾅ', &['ᾍ']), + ('ᾆ', &['ᾎ']), + ('ᾇ', &['ᾏ']), + ('ᾈ', &['ᾀ']), + ('ᾉ', &['ᾁ']), + ('ᾊ', &['ᾂ']), + ('ᾋ', &['ᾃ']), + ('ᾌ', &['ᾄ']), + ('ᾍ', &['ᾅ']), + ('ᾎ', &['ᾆ']), + ('ᾏ', &['ᾇ']), + ('ᾐ', &['ᾘ']), + ('ᾑ', &['ᾙ']), + ('ᾒ', &['ᾚ']), + ('ᾓ', &['ᾛ']), + ('ᾔ', &['ᾜ']), + ('ᾕ', &['ᾝ']), + ('ᾖ', &['ᾞ']), + ('ᾗ', &['ᾟ']), + ('ᾘ', &['ᾐ']), + ('ᾙ', &['ᾑ']), + ('ᾚ', &['ᾒ']), + ('ᾛ', &['ᾓ']), + ('ᾜ', &['ᾔ']), + ('ᾝ', &['ᾕ']), + ('ᾞ', &['ᾖ']), + ('ᾟ', &['ᾗ']), + ('ᾠ', &['ᾨ']), + ('ᾡ', &['ᾩ']), + ('ᾢ', &['ᾪ']), + ('ᾣ', &['ᾫ']), + ('ᾤ', &['ᾬ']), + 
('ᾥ', &['ᾭ']), + ('ᾦ', &['ᾮ']), + ('ᾧ', &['ᾯ']), + ('ᾨ', &['ᾠ']), + ('ᾩ', &['ᾡ']), + ('ᾪ', &['ᾢ']), + ('ᾫ', &['ᾣ']), + ('ᾬ', &['ᾤ']), + ('ᾭ', &['ᾥ']), + ('ᾮ', &['ᾦ']), + ('ᾯ', &['ᾧ']), + ('ᾰ', &['Ᾰ']), + ('ᾱ', &['Ᾱ']), + ('ᾳ', &['ᾼ']), + ('Ᾰ', &['ᾰ']), + ('Ᾱ', &['ᾱ']), + ('Ὰ', &['ὰ']), + ('Ά', &['ά']), + ('ᾼ', &['ᾳ']), + ('ι', &['ͅ', 'Ι', 'ι']), + ('ῃ', &['ῌ']), + ('Ὲ', &['ὲ']), + ('Έ', &['έ']), + ('Ὴ', &['ὴ']), + ('Ή', &['ή']), + ('ῌ', &['ῃ']), + ('ῐ', &['Ῐ']), + ('ῑ', &['Ῑ']), + ('Ῐ', &['ῐ']), + ('Ῑ', &['ῑ']), + ('Ὶ', &['ὶ']), + ('Ί', &['ί']), + ('ῠ', &['Ῠ']), + ('ῡ', &['Ῡ']), + ('ῥ', &['Ῥ']), + ('Ῠ', &['ῠ']), + ('Ῡ', &['ῡ']), + ('Ὺ', &['ὺ']), + ('Ύ', &['ύ']), + ('Ῥ', &['ῥ']), + ('ῳ', &['ῼ']), + ('Ὸ', &['ὸ']), + ('Ό', &['ό']), + ('Ὼ', &['ὼ']), + ('Ώ', &['ώ']), + ('ῼ', &['ῳ']), + ('Ω', &['Ω', 'ω']), + ('K', &['K', 'k']), + ('Å', &['Å', 'å']), + ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), + ('Ⅰ', &['ⅰ']), + ('Ⅱ', &['ⅱ']), + ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), + ('Ⅴ', &['ⅴ']), + ('Ⅵ', &['ⅵ']), + ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), + ('Ⅸ', &['ⅸ']), + ('Ⅹ', &['ⅹ']), + ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), + ('Ⅼ', &['ⅼ']), + ('Ⅽ', &['ⅽ']), + ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), + ('ⅰ', &['Ⅰ']), + ('ⅱ', &['Ⅱ']), + ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), + ('ⅴ', &['Ⅴ']), + ('ⅵ', &['Ⅵ']), + ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), + ('ⅸ', &['Ⅸ']), + ('ⅹ', &['Ⅹ']), + ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), + ('ⅼ', &['Ⅼ']), + ('ⅽ', &['Ⅽ']), + ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), + ('Ↄ', &['ↄ']), + ('ↄ', &['Ↄ']), + ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), + ('Ⓒ', &['ⓒ']), + ('Ⓓ', &['ⓓ']), + ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), + ('Ⓖ', &['ⓖ']), + ('Ⓗ', &['ⓗ']), + ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), + ('Ⓚ', &['ⓚ']), + ('Ⓛ', &['ⓛ']), + ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), + ('Ⓞ', &['ⓞ']), + ('Ⓟ', &['ⓟ']), + ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), + ('Ⓢ', &['ⓢ']), + ('Ⓣ', &['ⓣ']), + ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), + ('Ⓦ', &['ⓦ']), + ('Ⓧ', &['ⓧ']), + ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), + ('ⓐ', &['Ⓐ']), + ('ⓑ', &['Ⓑ']), + ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), + ('ⓔ', &['Ⓔ']), + 
('ⓕ', &['Ⓕ']), + ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), + ('ⓘ', &['Ⓘ']), + ('ⓙ', &['Ⓙ']), + ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), + ('ⓜ', &['Ⓜ']), + ('ⓝ', &['Ⓝ']), + ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), + ('ⓠ', &['Ⓠ']), + ('ⓡ', &['Ⓡ']), + ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), + ('ⓤ', &['Ⓤ']), + ('ⓥ', &['Ⓥ']), + ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), + ('ⓨ', &['Ⓨ']), + ('ⓩ', &['Ⓩ']), + ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), + ('Ⰲ', &['ⰲ']), + ('Ⰳ', &['ⰳ']), + ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), + ('Ⰶ', &['ⰶ']), + ('Ⰷ', &['ⰷ']), + ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), + ('Ⰺ', &['ⰺ']), + ('Ⰻ', &['ⰻ']), + ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), + ('Ⰾ', &['ⰾ']), + ('Ⰿ', &['ⰿ']), + ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), + ('Ⱂ', &['ⱂ']), + ('Ⱃ', &['ⱃ']), + ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), + ('Ⱆ', &['ⱆ']), + ('Ⱇ', &['ⱇ']), + ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), + ('Ⱊ', &['ⱊ']), + ('Ⱋ', &['ⱋ']), + ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), + ('Ⱎ', &['ⱎ']), + ('Ⱏ', &['ⱏ']), + ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), + ('Ⱒ', &['ⱒ']), + ('Ⱓ', &['ⱓ']), + ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), + ('Ⱖ', &['ⱖ']), + ('Ⱗ', &['ⱗ']), + ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), + ('Ⱚ', &['ⱚ']), + ('Ⱛ', &['ⱛ']), + ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), + ('Ⱞ', &['ⱞ']), + ('ⰰ', &['Ⰰ']), + ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), + ('ⰳ', &['Ⰳ']), + ('ⰴ', &['Ⰴ']), + ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), + ('ⰷ', &['Ⰷ']), + ('ⰸ', &['Ⰸ']), + ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), + ('ⰻ', &['Ⰻ']), + ('ⰼ', &['Ⰼ']), + ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), + ('ⰿ', &['Ⰿ']), + ('ⱀ', &['Ⱀ']), + ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), + ('ⱃ', &['Ⱃ']), + ('ⱄ', &['Ⱄ']), + ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), + ('ⱇ', &['Ⱇ']), + ('ⱈ', &['Ⱈ']), + ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), + ('ⱋ', &['Ⱋ']), + ('ⱌ', &['Ⱌ']), + ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), + ('ⱏ', &['Ⱏ']), + ('ⱐ', &['Ⱐ']), + ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), + ('ⱓ', &['Ⱓ']), + ('ⱔ', &['Ⱔ']), + ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), + ('ⱗ', &['Ⱗ']), + ('ⱘ', &['Ⱘ']), + ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), + ('ⱛ', &['Ⱛ']), + ('ⱜ', &['Ⱜ']), + ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), + ('Ⱡ', &['ⱡ']), + ('ⱡ', &['Ⱡ']), + ('Ɫ', 
&['ɫ']), + ('Ᵽ', &['ᵽ']), + ('Ɽ', &['ɽ']), + ('ⱥ', &['Ⱥ']), + ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), + ('ⱨ', &['Ⱨ']), + ('Ⱪ', &['ⱪ']), + ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), + ('ⱬ', &['Ⱬ']), + ('Ɑ', &['ɑ']), + ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), + ('Ɒ', &['ɒ']), + ('Ⱳ', &['ⱳ']), + ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), + ('ⱶ', &['Ⱶ']), + ('Ȿ', &['ȿ']), + ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), + ('ⲁ', &['Ⲁ']), + ('Ⲃ', &['ⲃ']), + ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), + ('ⲅ', &['Ⲅ']), + ('Ⲇ', &['ⲇ']), + ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), + ('ⲉ', &['Ⲉ']), + ('Ⲋ', &['ⲋ']), + ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), + ('ⲍ', &['Ⲍ']), + ('Ⲏ', &['ⲏ']), + ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), + ('ⲑ', &['Ⲑ']), + ('Ⲓ', &['ⲓ']), + ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), + ('ⲕ', &['Ⲕ']), + ('Ⲗ', &['ⲗ']), + ('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), + ('ⲙ', &['Ⲙ']), + ('Ⲛ', &['ⲛ']), + ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), + ('ⲝ', &['Ⲝ']), + ('Ⲟ', &['ⲟ']), + ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), + ('ⲡ', &['Ⲡ']), + ('Ⲣ', &['ⲣ']), + ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), + ('ⲥ', &['Ⲥ']), + ('Ⲧ', &['ⲧ']), + ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), + ('ⲩ', &['Ⲩ']), + ('Ⲫ', &['ⲫ']), + ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), + ('ⲭ', &['Ⲭ']), + ('Ⲯ', &['ⲯ']), + ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), + ('ⲱ', &['Ⲱ']), + ('Ⲳ', &['ⲳ']), + ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), + ('ⲵ', &['Ⲵ']), + ('Ⲷ', &['ⲷ']), + ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), + ('ⲹ', &['Ⲹ']), + ('Ⲻ', &['ⲻ']), + ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), + ('ⲽ', &['Ⲽ']), + ('Ⲿ', &['ⲿ']), + ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), + ('ⳁ', &['Ⳁ']), + ('Ⳃ', &['ⳃ']), + ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), + ('ⳅ', &['Ⳅ']), + ('Ⳇ', &['ⳇ']), + ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), + ('ⳉ', &['Ⳉ']), + ('Ⳋ', &['ⳋ']), + ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), + ('ⳍ', &['Ⳍ']), + ('Ⳏ', &['ⳏ']), + ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), + ('ⳑ', &['Ⳑ']), + ('Ⳓ', &['ⳓ']), + ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), + ('ⳕ', &['Ⳕ']), + ('Ⳗ', &['ⳗ']), + ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), + ('ⳙ', &['Ⳙ']), + ('Ⳛ', &['ⳛ']), + ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), + ('ⳝ', &['Ⳝ']), + ('Ⳟ', &['ⳟ']), + ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), + 
('ⳡ', &['Ⳡ']), + ('Ⳣ', &['ⳣ']), + ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), + ('ⳬ', &['Ⳬ']), + ('Ⳮ', &['ⳮ']), + ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), + ('ⳳ', &['Ⳳ']), + ('ⴀ', &['Ⴀ']), + ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), + ('ⴃ', &['Ⴃ']), + ('ⴄ', &['Ⴄ']), + ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), + ('ⴇ', &['Ⴇ']), + ('ⴈ', &['Ⴈ']), + ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), + ('ⴋ', &['Ⴋ']), + ('ⴌ', &['Ⴌ']), + ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), + ('ⴏ', &['Ⴏ']), + ('ⴐ', &['Ⴐ']), + ('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), + ('ⴓ', &['Ⴓ']), + ('ⴔ', &['Ⴔ']), + ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), + ('ⴗ', &['Ⴗ']), + ('ⴘ', &['Ⴘ']), + ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), + ('ⴛ', &['Ⴛ']), + ('ⴜ', &['Ⴜ']), + ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), + ('ⴟ', &['Ⴟ']), + ('ⴠ', &['Ⴠ']), + ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), + ('ⴣ', &['Ⴣ']), + ('ⴤ', &['Ⴤ']), + ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), + ('ⴭ', &['Ⴭ']), + ('Ꙁ', &['ꙁ']), + ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), + ('ꙃ', &['Ꙃ']), + ('Ꙅ', &['ꙅ']), + ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), + ('ꙇ', &['Ꙇ']), + ('Ꙉ', &['ꙉ']), + ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ']), + ('ꙋ', &['ᲈ', 'Ꙋ']), + ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), + ('Ꙏ', &['ꙏ']), + ('ꙏ', &['Ꙏ']), + ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), + ('Ꙓ', &['ꙓ']), + ('ꙓ', &['Ꙓ']), + ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), + ('Ꙗ', &['ꙗ']), + ('ꙗ', &['Ꙗ']), + ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), + ('Ꙛ', &['ꙛ']), + ('ꙛ', &['Ꙛ']), + ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), + ('Ꙟ', &['ꙟ']), + ('ꙟ', &['Ꙟ']), + ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), + ('Ꙣ', &['ꙣ']), + ('ꙣ', &['Ꙣ']), + ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), + ('Ꙧ', &['ꙧ']), + ('ꙧ', &['Ꙧ']), + ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), + ('Ꙫ', &['ꙫ']), + ('ꙫ', &['Ꙫ']), + ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), + ('Ꚁ', &['ꚁ']), + ('ꚁ', &['Ꚁ']), + ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), + ('Ꚅ', &['ꚅ']), + ('ꚅ', &['Ꚅ']), + ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), + ('Ꚉ', &['ꚉ']), + ('ꚉ', &['Ꚉ']), + ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), + ('Ꚍ', &['ꚍ']), + ('ꚍ', &['Ꚍ']), + ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), + ('Ꚑ', &['ꚑ']), + ('ꚑ', &['Ꚑ']), + ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), + ('Ꚕ', &['ꚕ']), + ('ꚕ', &['Ꚕ']), + 
('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), + ('Ꚙ', &['ꚙ']), + ('ꚙ', &['Ꚙ']), + ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), + ('Ꜣ', &['ꜣ']), + ('ꜣ', &['Ꜣ']), + ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), + ('Ꜧ', &['ꜧ']), + ('ꜧ', &['Ꜧ']), + ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), + ('Ꜫ', &['ꜫ']), + ('ꜫ', &['Ꜫ']), + ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), + ('Ꜯ', &['ꜯ']), + ('ꜯ', &['Ꜯ']), + ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), + ('Ꜵ', &['ꜵ']), + ('ꜵ', &['Ꜵ']), + ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), + ('Ꜹ', &['ꜹ']), + ('ꜹ', &['Ꜹ']), + ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), + ('Ꜽ', &['ꜽ']), + ('ꜽ', &['Ꜽ']), + ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), + ('Ꝁ', &['ꝁ']), + ('ꝁ', &['Ꝁ']), + ('Ꝃ', &['ꝃ']), + ('ꝃ', &['Ꝃ']), + ('Ꝅ', &['ꝅ']), + ('ꝅ', &['Ꝅ']), + ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), + ('Ꝉ', &['ꝉ']), + ('ꝉ', &['Ꝉ']), + ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), + ('Ꝍ', &['ꝍ']), + ('ꝍ', &['Ꝍ']), + ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), + ('Ꝑ', &['ꝑ']), + ('ꝑ', &['Ꝑ']), + ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), + ('Ꝕ', &['ꝕ']), + ('ꝕ', &['Ꝕ']), + ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), + ('Ꝙ', &['ꝙ']), + ('ꝙ', &['Ꝙ']), + ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), + ('Ꝝ', &['ꝝ']), + ('ꝝ', &['Ꝝ']), + ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), + ('Ꝡ', &['ꝡ']), + ('ꝡ', &['Ꝡ']), + ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), + ('Ꝥ', &['ꝥ']), + ('ꝥ', &['Ꝥ']), + ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), + ('Ꝩ', &['ꝩ']), + ('ꝩ', &['Ꝩ']), + ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), + ('Ꝭ', &['ꝭ']), + ('ꝭ', &['Ꝭ']), + ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), + ('Ꝺ', &['ꝺ']), + ('ꝺ', &['Ꝺ']), + ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), + ('Ᵹ', &['ᵹ']), + ('Ꝿ', &['ꝿ']), + ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), + ('ꞁ', &['Ꞁ']), + ('Ꞃ', &['ꞃ']), + ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), + ('ꞅ', &['Ꞅ']), + ('Ꞇ', &['ꞇ']), + ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), + ('ꞌ', &['Ꞌ']), + ('Ɥ', &['ɥ']), + ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), + ('Ꞓ', &['ꞓ']), + ('ꞓ', &['Ꞓ']), + ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), + ('Ꞙ', &['ꞙ']), + ('ꞙ', &['Ꞙ']), + ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), + ('Ꞝ', &['ꞝ']), + ('ꞝ', &['Ꞝ']), + ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), + ('Ꞡ', &['ꞡ']), + ('ꞡ', &['Ꞡ']), + ('Ꞣ', &['ꞣ']), + ('ꞣ', 
&['Ꞣ']), + ('Ꞥ', &['ꞥ']), + ('ꞥ', &['Ꞥ']), + ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), + ('Ꞩ', &['ꞩ']), + ('ꞩ', &['Ꞩ']), + ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), + ('Ɡ', &['ɡ']), + ('Ɬ', &['ɬ']), + ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), + ('Ʇ', &['ʇ']), + ('Ʝ', &['ʝ']), + ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), + ('ꞵ', &['Ꞵ']), + ('Ꞷ', &['ꞷ']), + ('ꞷ', &['Ꞷ']), + ('ꭓ', &['Ꭓ']), + ('ꭰ', &['Ꭰ']), + ('ꭱ', &['Ꭱ']), + ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), + ('ꭴ', &['Ꭴ']), + ('ꭵ', &['Ꭵ']), + ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), + ('ꭸ', &['Ꭸ']), + ('ꭹ', &['Ꭹ']), + ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), + ('ꭼ', &['Ꭼ']), + ('ꭽ', &['Ꭽ']), + ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), + ('ꮀ', &['Ꮀ']), + ('ꮁ', &['Ꮁ']), + ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), + ('ꮄ', &['Ꮄ']), + ('ꮅ', &['Ꮅ']), + ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), + ('ꮈ', &['Ꮈ']), + ('ꮉ', &['Ꮉ']), + ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), + ('ꮌ', &['Ꮌ']), + ('ꮍ', &['Ꮍ']), + ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), + ('ꮐ', &['Ꮐ']), + ('ꮑ', &['Ꮑ']), + ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), + ('ꮔ', &['Ꮔ']), + ('ꮕ', &['Ꮕ']), + ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), + ('ꮘ', &['Ꮘ']), + ('ꮙ', &['Ꮙ']), + ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), + ('ꮜ', &['Ꮜ']), + ('ꮝ', &['Ꮝ']), + ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), + ('ꮠ', &['Ꮠ']), + ('ꮡ', &['Ꮡ']), + ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), + ('ꮤ', &['Ꮤ']), + ('ꮥ', &['Ꮥ']), + ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), + ('ꮨ', &['Ꮨ']), + ('ꮩ', &['Ꮩ']), + ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), + ('ꮬ', &['Ꮬ']), + ('ꮭ', &['Ꮭ']), + ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), + ('ꮰ', &['Ꮰ']), + ('ꮱ', &['Ꮱ']), + ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), + ('ꮴ', &['Ꮴ']), + ('ꮵ', &['Ꮵ']), + ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), + ('ꮸ', &['Ꮸ']), + ('ꮹ', &['Ꮹ']), + ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), + ('ꮼ', &['Ꮼ']), + ('ꮽ', &['Ꮽ']), + ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + 
('R', &['r']), + ('S', &['s']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('𐐀', &['𐐨']), + ('𐐁', &['𐐩']), + ('𐐂', &['𐐪']), + ('𐐃', &['𐐫']), + ('𐐄', &['𐐬']), + ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), + ('𐐇', &['𐐯']), + ('𐐈', &['𐐰']), + ('𐐉', &['𐐱']), + ('𐐊', &['𐐲']), + ('𐐋', &['𐐳']), + ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), + ('𐐎', &['𐐶']), + ('𐐏', &['𐐷']), + ('𐐐', &['𐐸']), + ('𐐑', &['𐐹']), + ('𐐒', &['𐐺']), + ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), + ('𐐕', &['𐐽']), + ('𐐖', &['𐐾']), + ('𐐗', &['𐐿']), + ('𐐘', &['𐑀']), + ('𐐙', &['𐑁']), + ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), + ('𐐜', &['𐑄']), + ('𐐝', &['𐑅']), + ('𐐞', &['𐑆']), + ('𐐟', &['𐑇']), + ('𐐠', &['𐑈']), + ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), + ('𐐣', &['𐑋']), + ('𐐤', &['𐑌']), + ('𐐥', &['𐑍']), + ('𐐦', &['𐑎']), + ('𐐧', &['𐑏']), + ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), + ('𐐪', &['𐐂']), + ('𐐫', &['𐐃']), + ('𐐬', &['𐐄']), + ('𐐭', &['𐐅']), + ('𐐮', &['𐐆']), + ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), + ('𐐱', &['𐐉']), + ('𐐲', &['𐐊']), + ('𐐳', &['𐐋']), + ('𐐴', &['𐐌']), + ('𐐵', &['𐐍']), + ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), + ('𐐸', &['𐐐']), + ('𐐹', &['𐐑']), + ('𐐺', &['𐐒']), + ('𐐻', &['𐐓']), + ('𐐼', &['𐐔']), + ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), + ('𐐿', &['𐐗']), + ('𐑀', &['𐐘']), + ('𐑁', &['𐐙']), + ('𐑂', &['𐐚']), + ('𐑃', &['𐐛']), + ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), + ('𐑆', &['𐐞']), + ('𐑇', &['𐐟']), + ('𐑈', &['𐐠']), + ('𐑉', &['𐐡']), + ('𐑊', &['𐐢']), + ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), + ('𐑍', &['𐐥']), + ('𐑎', &['𐐦']), + ('𐑏', &['𐐧']), + ('𐒰', &['𐓘']), + ('𐒱', &['𐓙']), + ('𐒲', 
&['𐓚']), + ('𐒳', &['𐓛']), + ('𐒴', &['𐓜']), + ('𐒵', &['𐓝']), + ('𐒶', &['𐓞']), + ('𐒷', &['𐓟']), + ('𐒸', &['𐓠']), + ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), + ('𐒻', &['𐓣']), + ('𐒼', &['𐓤']), + ('𐒽', &['𐓥']), + ('𐒾', &['𐓦']), + ('𐒿', &['𐓧']), + ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), + ('𐓂', &['𐓪']), + ('𐓃', &['𐓫']), + ('𐓄', &['𐓬']), + ('𐓅', &['𐓭']), + ('𐓆', &['𐓮']), + ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), + ('𐓉', &['𐓱']), + ('𐓊', &['𐓲']), + ('𐓋', &['𐓳']), + ('𐓌', &['𐓴']), + ('𐓍', &['𐓵']), + ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), + ('𐓐', &['𐓸']), + ('𐓑', &['𐓹']), + ('𐓒', &['𐓺']), + ('𐓓', &['𐓻']), + ('𐓘', &['𐒰']), + ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), + ('𐓛', &['𐒳']), + ('𐓜', &['𐒴']), + ('𐓝', &['𐒵']), + ('𐓞', &['𐒶']), + ('𐓟', &['𐒷']), + ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), + ('𐓢', &['𐒺']), + ('𐓣', &['𐒻']), + ('𐓤', &['𐒼']), + ('𐓥', &['𐒽']), + ('𐓦', &['𐒾']), + ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), + ('𐓩', &['𐓁']), + ('𐓪', &['𐓂']), + ('𐓫', &['𐓃']), + ('𐓬', &['𐓄']), + ('𐓭', &['𐓅']), + ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), + ('𐓰', &['𐓈']), + ('𐓱', &['𐓉']), + ('𐓲', &['𐓊']), + ('𐓳', &['𐓋']), + ('𐓴', &['𐓌']), + ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), + ('𐓷', &['𐓏']), + ('𐓸', &['𐓐']), + ('𐓹', &['𐓑']), + ('𐓺', &['𐓒']), + ('𐓻', &['𐓓']), + ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), + ('𐲂', &['𐳂']), + ('𐲃', &['𐳃']), + ('𐲄', &['𐳄']), + ('𐲅', &['𐳅']), + ('𐲆', &['𐳆']), + ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), + ('𐲉', &['𐳉']), + ('𐲊', &['𐳊']), + ('𐲋', &['𐳋']), + ('𐲌', &['𐳌']), + ('𐲍', &['𐳍']), + ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), + ('𐲐', &['𐳐']), + ('𐲑', &['𐳑']), + ('𐲒', &['𐳒']), + ('𐲓', &['𐳓']), + ('𐲔', &['𐳔']), + ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), + ('𐲗', &['𐳗']), + ('𐲘', &['𐳘']), + ('𐲙', &['𐳙']), + ('𐲚', &['𐳚']), + ('𐲛', &['𐳛']), + ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), + ('𐲞', &['𐳞']), + ('𐲟', &['𐳟']), + ('𐲠', &['𐳠']), + ('𐲡', &['𐳡']), + ('𐲢', &['𐳢']), + ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), + ('𐲥', &['𐳥']), + ('𐲦', &['𐳦']), + ('𐲧', &['𐳧']), + ('𐲨', &['𐳨']), + ('𐲩', &['𐳩']), + ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), + ('𐲬', &['𐳬']), + ('𐲭', &['𐳭']), + ('𐲮', &['𐳮']), + ('𐲯', &['𐳯']), + 
('𐲰', &['𐳰']), + ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), + ('𐳀', &['𐲀']), + ('𐳁', &['𐲁']), + ('𐳂', &['𐲂']), + ('𐳃', &['𐲃']), + ('𐳄', &['𐲄']), + ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), + ('𐳇', &['𐲇']), + ('𐳈', &['𐲈']), + ('𐳉', &['𐲉']), + ('𐳊', &['𐲊']), + ('𐳋', &['𐲋']), + ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), + ('𐳎', &['𐲎']), + ('𐳏', &['𐲏']), + ('𐳐', &['𐲐']), + ('𐳑', &['𐲑']), + ('𐳒', &['𐲒']), + ('𐳓', &['𐲓']), + ('𐳔', &['𐲔']), + ('𐳕', &['𐲕']), + ('𐳖', &['𐲖']), + ('𐳗', &['𐲗']), + ('𐳘', &['𐲘']), + ('𐳙', &['𐲙']), + ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), + ('𐳜', &['𐲜']), + ('𐳝', &['𐲝']), + ('𐳞', &['𐲞']), + ('𐳟', &['𐲟']), + ('𐳠', &['𐲠']), + ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), + ('𐳣', &['𐲣']), + ('𐳤', &['𐲤']), + ('𐳥', &['𐲥']), + ('𐳦', &['𐲦']), + ('𐳧', &['𐲧']), + ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), + ('𐳪', &['𐲪']), + ('𐳫', &['𐲫']), + ('𐳬', &['𐲬']), + ('𐳭', &['𐲭']), + ('𐳮', &['𐲮']), + ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), + ('𐳱', &['𐲱']), + ('𐳲', &['𐲲']), + ('𑢠', &['𑣀']), + ('𑢡', &['𑣁']), + ('𑢢', &['𑣂']), + ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), + ('𑢥', &['𑣅']), + ('𑢦', &['𑣆']), + ('𑢧', &['𑣇']), + ('𑢨', &['𑣈']), + ('𑢩', &['𑣉']), + ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), + ('𑢬', &['𑣌']), + ('𑢭', &['𑣍']), + ('𑢮', &['𑣎']), + ('𑢯', &['𑣏']), + ('𑢰', &['𑣐']), + ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), + ('𑢳', &['𑣓']), + ('𑢴', &['𑣔']), + ('𑢵', &['𑣕']), + ('𑢶', &['𑣖']), + ('𑢷', &['𑣗']), + ('𑢸', &['𑣘']), + ('𑢹', &['𑣙']), + ('𑢺', &['𑣚']), + ('𑢻', &['𑣛']), + ('𑢼', &['𑣜']), + ('𑢽', &['𑣝']), + ('𑢾', &['𑣞']), + ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), + ('𑣁', &['𑢡']), + ('𑣂', &['𑢢']), + ('𑣃', &['𑢣']), + ('𑣄', &['𑢤']), + ('𑣅', &['𑢥']), + ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), + ('𑣈', &['𑢨']), + ('𑣉', &['𑢩']), + ('𑣊', &['𑢪']), + ('𑣋', &['𑢫']), + ('𑣌', &['𑢬']), + ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), + ('𑣏', &['𑢯']), + ('𑣐', &['𑢰']), + ('𑣑', &['𑢱']), + ('𑣒', &['𑢲']), + ('𑣓', &['𑢳']), + ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), + ('𑣖', &['𑢶']), + ('𑣗', &['𑢷']), + ('𑣘', &['𑢸']), + ('𑣙', &['𑢹']), + ('𑣚', &['𑢺']), + ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), + ('𑣝', &['𑢽']), + ('𑣞', &['𑢾']), + ('𑣟', 
&['𑢿']), + ('𞤀', &['𞤢']), + ('𞤁', &['𞤣']), + ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), + ('𞤄', &['𞤦']), + ('𞤅', &['𞤧']), + ('𞤆', &['𞤨']), + ('𞤇', &['𞤩']), + ('𞤈', &['𞤪']), + ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), + ('𞤋', &['𞤭']), + ('𞤌', &['𞤮']), + ('𞤍', &['𞤯']), + ('𞤎', &['𞤰']), + ('𞤏', &['𞤱']), + ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), + ('𞤒', &['𞤴']), + ('𞤓', &['𞤵']), + ('𞤔', &['𞤶']), + ('𞤕', &['𞤷']), + ('𞤖', &['𞤸']), + ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), + ('𞤙', &['𞤻']), + ('𞤚', &['𞤼']), + ('𞤛', &['𞤽']), + ('𞤜', &['𞤾']), + ('𞤝', &['𞤿']), + ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), + ('𞤠', &['𞥂']), + ('𞤡', &['𞥃']), + ('𞤢', &['𞤀']), + ('𞤣', &['𞤁']), + ('𞤤', &['𞤂']), + ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), + ('𞤧', &['𞤅']), + ('𞤨', &['𞤆']), + ('𞤩', &['𞤇']), + ('𞤪', &['𞤈']), + ('𞤫', &['𞤉']), + ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), + ('𞤮', &['𞤌']), + ('𞤯', &['𞤍']), + ('𞤰', &['𞤎']), + ('𞤱', &['𞤏']), + ('𞤲', &['𞤐']), + ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), + ('𞤵', &['𞤓']), + ('𞤶', &['𞤔']), + ('𞤷', &['𞤕']), + ('𞤸', &['𞤖']), + ('𞤹', &['𞤗']), + ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), + ('𞤼', &['𞤚']), + ('𞤽', &['𞤛']), + ('𞤾', &['𞤜']), + ('𞤿', &['𞤝']), + ('𞥀', &['𞤞']), + ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), + ('𞥃', &['𞤡']), +]; diff --git a/parse_wiki_text/src/character_entity.rs b/parse_wiki_text/src/character_entity.rs new file mode 100644 index 0000000..018efa0 --- /dev/null +++ b/parse_wiki_text/src/character_entity.rs @@ -0,0 +1,22 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 

/// Tries to parse a character entity at `state.scan_position`.
///
/// The text following the byte at `scan_position` (presumably the `&` —
/// confirm against the caller) is looked up in
/// `configuration.character_entities`. On a match, pending text is flushed,
/// the flushed and scan positions are moved just past the matched term and a
/// `Node::CharacterEntity` is pushed. Without a match only `scan_position`
/// is advanced, by one byte.
pub fn parse_character_entity(state: &mut crate::State, configuration: &crate::Configuration) {
    if let Ok((match_length, character)) = configuration
        .character_entities
        .find(&state.wiki_text[state.scan_position + 1..])
    {
        let start_position = state.scan_position;
        state.flush(start_position);
        // `+ 1` accounts for the byte at `start_position`, which was skipped
        // before the lookup.
        state.flushed_position = match_length + start_position + 1;
        state.scan_position = state.flushed_position;
        state.nodes.push(crate::Node::CharacterEntity {
            character,
            end: state.scan_position,
            start: start_position,
        });
    } else {
        state.scan_position += 1;
    }
}

// diff --git a/parse_wiki_text/src/comment.rs b/parse_wiki_text/src/comment.rs new file mode 100644 index 0000000..d933c17 --- /dev/null +++ b/parse_wiki_text/src/comment.rs @@ -0,0 +1,109 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Parses a comment that begins at `state.scan_position`.
///
/// Scanning starts 4 bytes after the start (presumably skipping `<!--` —
/// confirm against the caller) and stops just after the first `-->` or at the
/// end of the input; the whole span is pushed as a `Node::Comment`.
///
/// Every `</` met while scanning is handed to `parse_end_tag`. When that call
/// returns `true` it has already updated `state`, so this function returns
/// without pushing a comment node of its own.
pub fn parse_comment(state: &mut crate::State) {
    let start_position = state.scan_position;
    let mut position = start_position;
    state.flush(position);
    position += 4;
    while let Some(character) = state.get_byte(position) {
        match character {
            b'-' if state.get_byte(position + 1) == Some(b'-')
                && state.get_byte(position + 2) == Some(b'>') =>
            {
                // End of comment: step over `-->`.
                position += 3;
                break;
            }
            b'<' if state.get_byte(position + 1) == Some(b'/') => {
                if parse_end_tag(state, start_position, position) {
                    return;
                }
                // Not a usable end tag: step over `</` and keep scanning.
                position += 2;
                continue;
            }
            _ => {}
        }
        position += 1;
    }
    state.flushed_position = position;
    state.scan_position = position;
    state.nodes.push(crate::Node::Comment {
        end: state.scan_position,
        start: start_position,
    });
}

/// Handles a `</` found inside a comment.
///
/// `comment_start_position` is where the enclosing comment began and
/// `tag_start_position` is the position of the `<`.
///
/// Returns `false`, without pushing any node or warning, when the text is not
/// of the form `</name>` (optional whitespace allowed before `>`) or when no
/// open tag on `state.stack` has that name (compared in ASCII lowercase).
/// Otherwise a warning is recorded, the parser state is either rewound or the
/// matching tag is closed, and `true` is returned.
fn parse_end_tag(
    state: &mut crate::State,
    comment_start_position: usize,
    tag_start_position: usize,
) -> bool {
    // Skip the leading `</`.
    let tag_name_start_position = tag_start_position + 2;
    let mut tag_name_end_position = tag_name_start_position;
    // The loop yields the position of the closing `>`; the trailing `+ 1`
    // moves just past it.
    let tag_end_position = loop {
        match state.get_byte(tag_name_end_position) {
            None | Some(b'/') | Some(b'<') => return false,
            Some(b'\t') | Some(b'\n') | Some(b' ') => {
                let tag_end_position = state.skip_whitespace_forwards(tag_name_end_position + 1);
                match state.get_byte(tag_end_position) {
                    Some(b'>') => break tag_end_position,
                    _ => return false,
                }
            }
            Some(b'>') => break tag_name_end_position,
            _ => tag_name_end_position += 1,
        }
    } + 1;
    let tag_name = &state.wiki_text[tag_name_start_position..tag_name_end_position];
    // Borrow the name when it is already all ASCII lowercase; otherwise
    // allocate a lowercased copy.
    let tag_name = if tag_name.as_bytes().iter().all(u8::is_ascii_lowercase) {
        crate::Cow::Borrowed(tag_name)
    } else {
        tag_name.to_ascii_lowercase().into()
    };
    // Search the stack from its last element backwards for an open tag with
    // this name.
    let mut matched_node_index = None;
    for (open_node_index, open_node) in state.stack.iter().enumerate().rev() {
        if let crate::OpenNodeType::Tag { name, .. } = &open_node.type_ {
            if name == &tag_name {
                matched_node_index = Some(open_node_index);
                break;
            }
        }
    }
    match matched_node_index {
        None => false,
        Some(open_node_index) => {
            if open_node_index < state.stack.len() - 1 {
                // Other nodes were opened after the matching tag: warn, keep
                // the stack up to the first of them, pop it and rewind to its
                // start.
                state.warnings.push(crate::Warning {
                    end: tag_end_position,
                    message: crate::WarningMessage::MissingEndTagRewinding,
                    start: tag_start_position,
                });
                state.stack.truncate(open_node_index + 2);
                let open_node = state.stack.pop().unwrap();
                state.rewind(open_node.nodes, open_node.start);
            } else {
                // The matching tag is the last stack entry: warn, emit the
                // comment text up to the end tag as a comment node, then close
                // the tag with the nodes collected so far as its children.
                state.warnings.push(crate::Warning {
                    end: tag_end_position,
                    message: crate::WarningMessage::EndTagInComment,
                    start: tag_start_position,
                });
                state.nodes.push(crate::Node::Comment {
                    end: tag_start_position,
                    start: comment_start_position,
                });
                let open_node = state.stack.pop().unwrap();
                state.flushed_position = tag_end_position;
                state.scan_position = state.flushed_position;
                let nodes = std::mem::replace(&mut state.nodes, open_node.nodes);
                state.nodes.push(crate::Node::Tag {
                    end: state.scan_position,
                    name: tag_name,
                    nodes,
                    start: open_node.start,
                });
            }
            true
        }
    }
}

// diff --git a/parse_wiki_text/src/configuration.rs b/parse_wiki_text/src/configuration.rs new file mode 100644 index 0000000..875a69b --- /dev/null +++ b/parse_wiki_text/src/configuration.rs @@ -0,0 +1,164 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Site specific configuration of a wiki.
///
/// This is generated using the program [`fetch_mediawiki_configuration`](https://github.com/portstrom/fetch_mediawiki_configuration).
pub struct ConfigurationSource<'a> {
    /// Aliases of the category namespace.
    pub category_namespaces: &'a [&'a str],

    /// Tag names of extension tags.
    pub extension_tags: &'a [&'a str],

    /// Aliases of the file namespace.
    pub file_namespaces: &'a [&'a str],

    /// Characters that can appear in link trails.
    pub link_trail: &'a str,

    /// Magic words that can appear between `__` and `__`.
    pub magic_words: &'a [&'a str],

    /// Protocols that can be used for external links.
    pub protocols: &'a [&'a str],

    /// Magic words that can be used for redirects.
    pub redirect_magic_words: &'a [&'a str],
}

/// Namespace kinds stored in the `namespaces` trie of a configuration.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Namespace {
    Category,
    File,
}

impl crate::Configuration {
    /// Allocates and returns a new configuration based on the given site specific configuration.
    #[must_use]
    pub fn new(source: &ConfigurationSource) -> Self {
        let mut configuration = crate::Configuration {
            character_entities: crate::Trie::new(),
            link_trail_character_set: crate::HashSet::new(),
            magic_words: crate::Trie::new(),
            namespaces: crate::Trie::new(),
            protocols: crate::Trie::new(),
            redirect_magic_words: crate::Trie::new(),
            tag_name_map: crate::HashMap::new(),
        };
        // Entity terms are stored with their trailing `;` and are added as
        // case-sensitive terms.
        for (name, character) in crate::html_entities::HTML_ENTITIES {
            configuration
                .character_entities
                .add_case_sensitive_term(&format!("{};", name), *character);
        }
        for character in source.link_trail.chars() {
            configuration.link_trail_character_set.insert(character);
        }
        for protocol in source.protocols {
            configuration.protocols.add_term(protocol, ());
        }
        for magic_word in source.magic_words {
            configuration.magic_words.add_term(magic_word, ());
        }
        // Namespace terms are stored with their trailing `:`.
        for namespace in source.category_namespaces {
            configuration
                .namespaces
                .add_term(&format!("{}:", namespace), Namespace::Category);
        }
        for namespace in source.file_namespaces {
            configuration
                .namespaces
                .add_term(&format!("{}:", namespace), Namespace::File);
        }
        for redirect_magic_word in source.redirect_magic_words {
            configuration
                .redirect_magic_words
                .add_term(redirect_magic_word, ());
        }
        for tag_name in source.extension_tags {
            configuration
                .tag_name_map
                .insert(tag_name.to_string(), crate::TagClass::ExtensionTag);
        }
        // Fixed list of tag names registered as `TagClass::Tag`.
        // NOTE(review): these are inserted after the extension tags, so a name
        // present in both lists (e.g. "pre" in the default configuration)
        // presumably ends up as `TagClass::Tag`, assuming `crate::HashMap`
        // overwrites on insert like the std map — confirm this is intended.
        for tag_name in [
            "abbr",
            "b",
            "bdi",
            "bdo",
            "blockquote",
            "br",
            "caption",
            "center",
            "cite",
            "code",
            "data",
            "dd",
            "del",
            "dfn",
            "div",
            "dl",
            "dt",
            "em",
            "font",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "hr",
            "i",
            "ins",
            "kbd",
            "li",
            "mark",
            "ol",
            "p",
            "pre",
            "q",
            "rb",
            "rp",
            "rt",
            "ruby",
            "s",
            "samp",
            "small",
            "span",
            "strike",
            "strong",
            "sub",
            "sup",
            "table",
            "td",
            "th",
            "time",
            "tr",
            "tt",
            "u",
            "ul",
            "var",
            "wbr",
        ]
        .iter()
        {
            configuration
                .tag_name_map
                .insert(tag_name.to_string(), crate::TagClass::Tag);
        }
        configuration
    }

    /// Parses wiki text into structured data.
    #[must_use]
    pub fn parse<'a>(&self, wiki_text: &'a str) -> crate::Output<'a> {
        crate::parse::parse(self, wiki_text)
    }
}

impl Default for crate::Configuration {
    /// Allocates and returns a configuration suitable for testing and quick and dirty prototyping. For correctly parsing an actual wiki, please get the correct site configuration for that particular wiki.
    fn default() -> Self {
        crate::default::create_configuration()
    }
}

// diff --git a/parse_wiki_text/src/default.rs b/parse_wiki_text/src/default.rs new file mode 100644 index 0000000..50110e4 --- /dev/null +++ b/parse_wiki_text/src/default.rs @@ -0,0 +1,88 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Builds the configuration returned by `Configuration::default()` from a
/// hard-coded `ConfigurationSource`.
pub fn create_configuration() -> crate::Configuration {
    crate::Configuration::new(&crate::ConfigurationSource {
        category_namespaces: &["category"],
        extension_tags: &[
            "categorytree",
            "ce",
            "charinsert",
            "chem",
            "gallery",
            "graph",
            "hiero",
            "imagemap",
            "indicator",
            "inputbox",
            "mapframe",
            "maplink",
            "math",
            "nowiki",
            "poem",
            "pre",
            "ref",
            "references",
            "score",
            "section",
            "source",
            "syntaxhighlight",
            "templatedata",
            "timeline",
        ],
        file_namespaces: &["file", "image"],
        link_trail: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
        magic_words: &[
            "DISAMBIG",
            "FORCETOC",
            "HIDDENCAT",
            "INDEX",
            "NEWSECTIONLINK",
            "NOCC",
            "NOCOLLABORATIONHUBTOC",
            "NOCONTENTCONVERT",
            "NOEDITSECTION",
            "NOGALLERY",
            "NOGLOBAL",
            "NOINDEX",
            "NONEWSECTIONLINK",
            "NOTC",
            "NOTITLECONVERT",
            "NOTOC",
            "STATICREDIRECT",
            "TOC",
        ],
        protocols: &[
            "//",
            "bitcoin:",
            "ftp://",
            "ftps://",
            "geo:",
            "git://",
            "gopher://",
            "http://",
            "https://",
            "irc://",
            "ircs://",
            "magnet:",
            "mailto:",
            "mms://",
            "news:",
            "nntp://",
            "redis://",
            "sftp://",
            "sip:",
            "sips:",
            "sms:",
            "ssh://",
            "svn://",
            "tel:",
            "telnet://",
            "urn:",
            "worldwind://",
            "xmpp:",
        ],
        redirect_magic_words: &["REDIRECT"],
    })
}

// diff --git a/parse_wiki_text/src/external_link.rs b/parse_wiki_text/src/external_link.rs new file mode 100644 index 0000000..ce30f87 --- /dev/null +++ b/parse_wiki_text/src/external_link.rs @@ -0,0 +1,47 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Closes an external link at `state.scan_position`.
///
/// Pending text is flushed, the scan and flushed positions move one byte
/// forward (presumably past the closing `]` — confirm against the caller),
/// and the nodes collected so far are swapped with `nodes`, which becomes the
/// current node list again. A `Node::ExternalLink` from `start_position` to
/// the new scan position, holding the collected nodes, is then pushed onto it.
pub fn parse_external_link_end<'a>(
    state: &mut crate::State<'a>,
    start_position: usize,
    nodes: Vec<crate::Node<'a>>,
) {
    let scan_position = state.scan_position;
    state.flush(scan_position);
    state.scan_position += 1;
    state.flushed_position = state.scan_position;
    let nodes = std::mem::replace(&mut state.nodes, nodes);
    state.nodes.push(crate::Node::ExternalLink {
        end: state.scan_position,
        nodes,
        start: start_position,
    });
}

/// Abandons the open node on top of the stack when the line ends first.
///
/// Pops the top open node, records an `InvalidLinkSyntax` warning spanning
/// from that node's start to `state.scan_position`, and rewinds to the node's
/// start.
///
/// # Panics
///
/// Panics if `state.stack` is empty.
pub fn parse_external_link_end_of_line(state: &mut crate::State) {
    let end = state.scan_position;
    let open_node = state.stack.pop().unwrap();
    state.warnings.push(crate::Warning {
        end,
        message: crate::WarningMessage::InvalidLinkSyntax,
        start: open_node.start,
    });
    state.rewind(open_node.nodes, open_node.start);
}

/// Tries to open an external link at `state.scan_position`.
///
/// The text one byte after `scan_position` is looked up in
/// `configuration.protocols`. Without a match `scan_position` simply advances
/// by one byte; with a match an open `ExternalLink` node is pushed.
pub fn parse_external_link_start(state: &mut crate::State, configuration: &crate::Configuration) {
    let scheme_start_position = state.scan_position + 1;
    match configuration
        .protocols
        .find(&state.wiki_text[scheme_start_position..])
    {
        Err(_) => {
            state.scan_position = scheme_start_position;
            return;
        }
        Ok(_) => {
            state.push_open_node(crate::OpenNodeType::ExternalLink, scheme_start_position);
        }
    }
}

// diff --git a/parse_wiki_text/src/heading.rs
// b/parse_wiki_text/src/heading.rs new file mode 100644 index 0000000..9c4d647 --- /dev/null +++ b/parse_wiki_text/src/heading.rs @@ -0,0 +1,88 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Finishes the heading whose open node is on top of `state.stack`.
///
/// Trailing tabs and spaces before `state.scan_position` are ignored. If the
/// remaining text does not end in `=`, or ends less than 3 bytes after the
/// open node's start, an `InvalidHeadingSyntaxRewinding` warning is recorded
/// and the parser rewinds to the open node's start.
///
/// Otherwise the closing run of `=` is counted, capped at the opening level.
/// When it is shorter than the opening run, an
/// `UnexpectedHeadingLevelCorrecting` warning is recorded and the surplus
/// opening `=` characters are turned into heading text. Finally a
/// `Node::Heading` with the closing level is pushed, the scan position moves
/// one byte forward and `skip_empty_lines` is called.
///
/// # Panics
///
/// Panics if `state.stack` is empty; reaches `unreachable!()` if the popped
/// node is not a heading.
pub fn parse_heading_end(state: &mut crate::State) {
    // Step back over trailing tabs and spaces.
    let mut end_position = state.scan_position;
    loop {
        match state.get_byte(end_position - 1) {
            Some(b'\t') | Some(b' ') => end_position -= 1,
            _ => break,
        }
    }
    let open_node = state.stack.pop().unwrap();
    if state.get_byte(end_position - 1) != Some(b'=') || end_position < open_node.start + 3 {
        state.warnings.push(crate::Warning {
            end: end_position,
            message: crate::WarningMessage::InvalidHeadingSyntaxRewinding,
            start: open_node.start,
        });
        state.rewind(open_node.nodes, open_node.start);
        return;
    }
    let start_level = match open_node.type_ {
        crate::OpenNodeType::Heading { level } => level,
        _ => unreachable!(),
    };
    // Count closing `=` characters backwards from `end_position`, never
    // exceeding the opening level. The middle condition bounds how far the
    // closing run may extend towards the start of the heading.
    let mut end_level: u8 = 1;
    while end_level < start_level
        && end_position - end_level as usize > open_node.start + end_level as usize + 2
        && state.get_byte(end_position - end_level as usize - 1) == Some(b'=')
    {
        end_level += 1;
    }
    // Position where the heading text ends, before the closing `=` run and
    // any whitespace preceding it.
    let position = state.skip_whitespace_backwards(end_position - end_level as usize);
    if end_level < start_level {
        state.warnings.push(crate::Warning {
            end: end_position,
            message: crate::WarningMessage::UnexpectedHeadingLevelCorrecting,
            start: open_node.start,
        });
        // The heading content now starts right after the first `end_level`
        // bytes of the opening run.
        let inner_start_position = open_node.start + end_level as usize;
        // The match fixes up the start of the content in place where it can
        // and evaluates to `true` only when a new leading text node has to be
        // inserted.
        if match state.nodes.get_mut(0) {
            None => {
                // No nodes yet: pending text simply starts earlier.
                state.flushed_position = inner_start_position;
                false
            }
            Some(crate::Node::Text { end, start, value }) => {
                // Extend the leading text node backwards.
                *start = inner_start_position;
                *value = &state.wiki_text[inner_start_position..*end];
                false
            }
            Some(_) => true,
        } {
            let end = state.skip_whitespace_forwards(open_node.start + start_level as usize);
            state.nodes.insert(
                0,
                crate::Node::Text {
                    end,
                    start: inner_start_position,
                    value: &state.wiki_text[inner_start_position..end],
                },
            );
        }
    }
    state.flush(position);
    // The nodes collected so far become the heading's children; the parent's
    // node list is restored as the current one.
    let nodes = std::mem::replace(&mut state.nodes, open_node.nodes);
    state.nodes.push(crate::Node::Heading {
        end: end_position,
        level: end_level,
        nodes,
        start: open_node.start,
    });
    state.scan_position += 1;
    state.skip_empty_lines();
}

/// Opens a heading at `state.scan_position`.
///
/// Counts the run of `=` bytes starting there (the byte at `scan_position`
/// itself is assumed to be `=` and is not re-checked), capped at 6, skips the
/// whitespace that follows, sets `flushed_position` to the resulting position
/// and pushes an open `Heading` node with the counted level.
pub fn parse_heading_start(state: &mut crate::State) {
    let mut level = 1;
    while state.get_byte(state.scan_position + level) == Some(b'=') && level < 6 {
        level += 1;
    }
    let position = state.skip_whitespace_forwards(state.scan_position + level);
    state.flushed_position = position;
    state.push_open_node(
        crate::OpenNodeType::Heading { level: level as u8 },
        position,
    );
}

// diff --git a/parse_wiki_text/src/html_entities.rs b/parse_wiki_text/src/html_entities.rs new file mode 100644 index 0000000..6625eb8 --- /dev/null +++ b/parse_wiki_text/src/html_entities.rs @@ -0,0 +1,259 @@
// Copyright 2019 Fredrik Portström <https://portstrom.com>
// This is free software distributed under the terms specified in
// the file LICENSE at the top-level directory of this distribution.

/// Named character entities and the characters they stand for.
///
/// Each entry is `(name, character)`. Names are stored without the leading
/// `&` and the trailing `;`, and they are case-sensitive: `"Dagger"` and
/// `"dagger"`, for example, map to different characters. Entries are kept
/// sorted by name in byte order.
pub const HTML_ENTITIES: &[(&str, char)] = &[
    ("AElig", '\u{c6}'),
    ("Aacute", '\u{c1}'),
    ("Acirc", '\u{c2}'),
    ("Agrave", '\u{c0}'),
    ("Alpha", '\u{391}'),
    ("Aring", '\u{c5}'),
    ("Atilde", '\u{c3}'),
    ("Auml", '\u{c4}'),
    ("Beta", '\u{392}'),
    ("Ccedil", '\u{c7}'),
    ("Chi", '\u{3a7}'),
    ("Dagger", '\u{2021}'),
    ("Delta", '\u{394}'),
    ("ETH", '\u{d0}'),
    ("Eacute", '\u{c9}'),
    ("Ecirc", '\u{ca}'),
    ("Egrave", '\u{c8}'),
    ("Epsilon", '\u{395}'),
    ("Eta", '\u{397}'),
    ("Euml", '\u{cb}'),
    ("Gamma", '\u{393}'),
    ("Iacute", '\u{cd}'),
    ("Icirc", '\u{ce}'),
    ("Igrave", '\u{cc}'),
    ("Iota", '\u{399}'),
    ("Iuml", '\u{cf}'),
    ("Kappa", '\u{39a}'),
    ("Lambda", '\u{39b}'),
    ("Mu", '\u{39c}'),
    ("Ntilde", '\u{d1}'),
    ("Nu", '\u{39d}'),
    ("OElig", '\u{152}'),
    ("Oacute", '\u{d3}'),
    ("Ocirc", '\u{d4}'),
    ("Ograve", '\u{d2}'),
    ("Omega", '\u{3a9}'),
    ("Omicron", '\u{39f}'),
    ("Oslash", '\u{d8}'),
    ("Otilde", '\u{d5}'),
    ("Ouml", '\u{d6}'),
    ("Phi", '\u{3a6}'),
    ("Pi", '\u{3a0}'),
    ("Prime", '\u{2033}'),
    ("Psi", '\u{3a8}'),
    ("Rho", '\u{3a1}'),
    ("Scaron", '\u{160}'),
    ("Sigma", '\u{3a3}'),
    ("THORN", '\u{de}'),
    ("Tau", '\u{3a4}'),
    ("Theta", '\u{398}'),
    ("Uacute", '\u{da}'),
    ("Ucirc", '\u{db}'),
    ("Ugrave", '\u{d9}'),
    ("Upsilon", '\u{3a5}'),
    ("Uuml", '\u{dc}'),
    ("Xi", '\u{39e}'),
    ("Yacute", '\u{dd}'),
    ("Yuml", '\u{178}'),
    ("Zeta", '\u{396}'),
    ("aacute", '\u{e1}'),
    ("acirc", '\u{e2}'),
    ("acute", '\u{b4}'),
    ("aelig", '\u{e6}'),
    ("agrave", '\u{e0}'),
    ("alefsym", '\u{2135}'),
    ("alpha", '\u{3b1}'),
    ("amp", '\u{26}'),
    ("and", '\u{2227}'),
    ("ang", '\u{2220}'),
    ("apos", '\u{27}'),
    ("aring", '\u{e5}'),
    ("asymp", '\u{2248}'),
    ("atilde", '\u{e3}'),
    ("auml", '\u{e4}'),
    ("bdquo", '\u{201e}'),
    ("beta", '\u{3b2}'),
    ("brvbar", '\u{a6}'),
    ("bull", '\u{2022}'),
    ("cap", '\u{2229}'),
    ("ccedil", '\u{e7}'),
    ("cedil", '\u{b8}'),
    ("cent", '\u{a2}'),
    ("chi", '\u{3c7}'),
    ("circ", '\u{2c6}'),
    ("clubs", '\u{2663}'),
    ("cong", '\u{2245}'),
    ("copy", '\u{a9}'),
    ("crarr", '\u{21b5}'),
    ("cup", '\u{222a}'),
    ("curren", '\u{a4}'),
    ("dArr", '\u{21d3}'),
    ("dagger", '\u{2020}'),
    ("darr", '\u{2193}'),
    ("deg", '\u{b0}'),
    ("delta", '\u{3b4}'),
    ("diams", '\u{2666}'),
    ("divide", '\u{f7}'),
    ("eacute", '\u{e9}'),
    ("ecirc", '\u{ea}'),
    ("egrave", '\u{e8}'),
    ("empty", '\u{2205}'),
    ("emsp", '\u{2003}'),
    ("ensp", '\u{2002}'),
    ("epsilon", '\u{3b5}'),
    ("equiv", '\u{2261}'),
    ("eta", '\u{3b7}'),
    ("eth", '\u{f0}'),
    ("euml", '\u{eb}'),
    ("euro", '\u{20ac}'),
    ("exist", '\u{2203}'),
    ("fnof", '\u{192}'),
    ("forall", '\u{2200}'),
    ("frac12", '\u{bd}'),
    ("frac14", '\u{bc}'),
    ("frac34", '\u{be}'),
    ("frasl", '\u{2044}'),
    ("gamma", '\u{3b3}'),
    ("ge", '\u{2265}'),
    ("gt", '\u{3e}'),
    ("hArr", '\u{21d4}'),
    ("harr", '\u{2194}'),
    ("hearts", '\u{2665}'),
    ("hellip", '\u{2026}'),
    ("iacute", '\u{ed}'),
    ("icirc", '\u{ee}'),
    ("iexcl", '\u{a1}'),
    ("igrave", '\u{ec}'),
    ("image", '\u{2111}'),
    ("infin", '\u{221e}'),
    ("int", '\u{222b}'),
    ("iota", '\u{3b9}'),
    ("iquest", '\u{bf}'),
    ("isin", '\u{2208}'),
    ("iuml", '\u{ef}'),
    ("kappa", '\u{3ba}'),
    ("lArr", '\u{21d0}'),
    ("lambda", '\u{3bb}'),
    ("lang", '\u{2329}'),
    ("laquo", '\u{ab}'),
    ("larr", '\u{2190}'),
    ("lceil", '\u{2308}'),
    ("ldquo", '\u{201c}'),
    ("le", '\u{2264}'),
    ("lfloor", '\u{230a}'),
    ("lowast", '\u{2217}'),
    ("loz", '\u{25ca}'),
    ("lrm", '\u{200e}'),
    ("lsaquo", '\u{2039}'),
    ("lsquo", '\u{2018}'),
    ("lt", '\u{3c}'),
    ("macr", '\u{af}'),
    ("mdash", '\u{2014}'),
    ("micro", '\u{b5}'),
    ("middot", '\u{b7}'),
    ("minus", '\u{2212}'),
    ("mu", '\u{3bc}'),
    ("nabla", '\u{2207}'),
    ("nbsp", '\u{a0}'),
    ("ndash", '\u{2013}'),
    ("ne", '\u{2260}'),
    ("ni", '\u{220b}'),
    ("not", '\u{ac}'),
    ("notin", '\u{2209}'),
    ("nsub", '\u{2284}'),
    ("ntilde", '\u{f1}'),
    ("nu", '\u{3bd}'),
    ("oacute", '\u{f3}'),
    ("ocirc", '\u{f4}'),
    ("oelig", '\u{153}'),
    ("ograve", '\u{f2}'),
    ("oline", '\u{203e}'),
    ("omega", '\u{3c9}'),
    ("omicron", '\u{3bf}'),
    ("oplus", '\u{2295}'),
    ("or", '\u{2228}'),
    ("ordf", '\u{aa}'),
    ("ordm", '\u{ba}'),
    ("oslash", '\u{f8}'),
    ("otilde", '\u{f5}'),
    ("otimes", '\u{2297}'),
    ("ouml", '\u{f6}'),
    ("para", '\u{b6}'),
    ("part", '\u{2202}'),
    ("permil", '\u{2030}'),
    ("perp", '\u{22a5}'),
    ("phi", '\u{3c6}'),
    ("pi", '\u{3c0}'),
    ("piv", '\u{3d6}'),
    ("plusmn", '\u{b1}'),
    ("pound", '\u{a3}'),
    ("prime", '\u{2032}'),
    ("prod", '\u{220f}'),
    ("prop", '\u{221d}'),
    ("psi", '\u{3c8}'),
    ("quot", '\u{22}'),
    ("rArr", '\u{21d2}'),
    ("radic", '\u{221a}'),
    ("rang", '\u{232a}'),
    ("raquo", '\u{bb}'),
    ("rarr", '\u{2192}'),
    ("rceil", '\u{2309}'),
    ("rdquo", '\u{201d}'),
    ("real", '\u{211c}'),
    ("reg", '\u{ae}'),
    ("rfloor", '\u{230b}'),
    ("rho", '\u{3c1}'),
    ("rlm", '\u{200f}'),
    ("rsaquo", '\u{203a}'),
    ("rsquo", '\u{2019}'),
    ("sbquo", '\u{201a}'),
    ("scaron", '\u{161}'),
    ("sdot", '\u{22c5}'),
    ("sect", '\u{a7}'),
    ("shy", '\u{ad}'),
    ("sigma", '\u{3c3}'),
    ("sigmaf", '\u{3c2}'),
    ("sim", '\u{223c}'),
    ("spades", '\u{2660}'),
    ("sub", '\u{2282}'),
    ("sube", '\u{2286}'),
    ("sum", '\u{2211}'),
    ("sup", '\u{2283}'),
    ("sup1", '\u{b9}'),
    ("sup2", '\u{b2}'),
    ("sup3", '\u{b3}'),
    ("supe", '\u{2287}'),
    ("szlig", '\u{df}'),
    ("tau", '\u{3c4}'),
    ("there4", '\u{2234}'),
    ("theta", '\u{3b8}'),
    ("thetasym", '\u{3d1}'),
    ("thinsp", '\u{2009}'),
    ("thorn", '\u{fe}'),
    ("tilde", '\u{2dc}'),
    ("times", '\u{d7}'),
    ("trade", '\u{2122}'),
    ("uArr", '\u{21d1}'),
    ("uacute", '\u{fa}'),
    ("uarr", '\u{2191}'),
    ("ucirc", '\u{fb}'),
    ("ugrave", '\u{f9}'),
    ("uml", '\u{a8}'),
    ("upsih", '\u{3d2}'),
    ("upsilon", '\u{3c5}'),
    ("uuml", '\u{fc}'),
    ("weierp", '\u{2118}'),
    ("xi", '\u{3be}'),
    ("yacute", '\u{fd}'),
    ("yen", '\u{a5}'),
    ("yuml", '\u{ff}'),
    ("zeta", '\u{3b6}'),
    ("zwj", '\u{200d}'),
    ("zwnj", '\u{200c}'),
];

// diff --git a/parse_wiki_text/src/lib.rs b/parse_wiki_text/src/lib.rs new file mode 100644 index 0000000..0eac2ea --- /dev/null +++ b/parse_wiki_text/src/lib.rs @@ -0,0 +1,604 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +//! Parse wiki text from Mediawiki into a tree of elements. +//! +//! # Introduction +//! +//! Wiki text is a format that follows the PHP maxim “Make everything as inconsistent and confusing as possible”. There are hundreds of millions of interesting documents written in this format, distributed under free licenses on sites that use the Mediawiki software, mainly Wikipedia and Wiktionary. Being able to parse wiki text and process these documents would allow access to a significant part of the world's knowledge. +//! +//! The Mediawiki software itself transforms a wiki text document into an HTML document in an outdated format to be displayed in a browser for a human reader. It does so through a [step by step procedure](https://www.mediawiki.org/wiki/Manual:Parser.php) of string substitutions, with some of the steps depending on the result of previous steps. [The main file for this procedure](https://doc.wikimedia.org/mediawiki-core/master/php/Parser_8php_source.html) has 6200 lines of code and the [second biggest file](https://doc.wikimedia.org/mediawiki-core/master/php/Preprocessor__DOM_8php_source.html) has 2000, and then there is a [1400 line file](https://doc.wikimedia.org/mediawiki-core/master/php/ParserOptions_8php_source.html) just to take options for the parser. +//! +//! What would be more interesting is to parse the wiki text document into a structure that can be used by a computer program to reason about the facts in the document and present them in different ways, making them available for a great variety of applications. +//! +//!
Some people have tried to parse wiki text using regular expressions. This is incredibly naive and fails as soon as the wiki text is non-trivial. The capabilities of regular expressions don't come anywhere close to the complexity of the weirdness required to correctly parse wiki text. One project made a brave attempt to use a parser generator to parse wiki text. Wiki text was however never designed for formal parsers, so even parser generators are of no help in correctly parsing wiki text. +//! +//! Wiki text has a long history of poorly designed additions carelessly piled on top of each other. The syntax of wiki text is different in each wiki depending on its configuration. You can't even know what's a start tag until you see the corresponding end tag, and you can't know where the end tag is unless you parse the entire hierarchy of nested tags between the start tag and the end tag. In short: If you think you understand wiki text, you don't understand wiki text. +//! +//! Parse Wiki Text attempts to take all uncertainty out of parsing wiki text by converting it to another format that is easy to work with. The target format is Rust objects that can ergonomically be processed using iterators and match expressions. +//! +//! # Design goals +//! +//! ## Correctness +//! +//! Parse Wiki Text is designed to parse wiki text exactly as parsed by Mediawiki. Even when there is obviously a bug in Mediawiki, Parse Wiki Text replicates that exact bug. If there is something Parse Wiki Text doesn't parse exactly the same as Mediawiki, please report it as an issue. +//! +//! ## Speed +//! +//! Parse Wiki Text is designed to parse a page in as little time as possible. It parses tens of thousands of pages per second on each processor core and can quickly parse an entire wiki with millions of pages. If there is anything that can be changed to make Parse Wiki Text faster, please report it as an issue. +//! +//! ## Safety +//! +//!
Parse Wiki Text is designed to work with untrusted inputs. If any input doesn't parse safely with reasonable resources, please report it as an issue. No unsafe code is used. +//! +//! ## Platform support +//! +//! Parse Wiki Text is designed to run in a wide variety of environments, such as: +//! +//! - servers running machine code +//! - browsers running Web Assembly +//! - embedded in other programming languages +//! +//! Parse Wiki Text can be deployed anywhere with no dependencies. +//! +//! # Caution +//! +//! Wiki text is a legacy format used by legacy software. Parse Wiki Text is intended only to recover information that has been written for wikis running legacy software, replicating the exact bugs found in the legacy software. Please don't use wiki text as a format for new applications. Wiki text is a horrible format with an astonishing amount of inconsistencies, bad design choices and bugs. For new applications, please use a format that is designed to be easy to process, such as JSON or even better [CBOR](http://cbor.io). See [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) for an example of a wiki that uses JSON as its format and provides a rich interface for editing data instead of letting people write code. If you need to take information written in wiki text and reuse it in a new application, you can use Parse Wiki Text to convert it to an intermediate format that you can further process into a modern format. +//! +//! # Site configuration +//! +//! Wiki text has plenty of features that are parsed in a way that depends on the configuration of the wiki. This means the configuration must be known before parsing. +//! +//! - External links are parsed only when the scheme of the URI of the link is in the configured list of valid protocols. When the scheme is not valid, the link is parsed as plain text. +//! - Categories and images superficially look the same way as links, but are parsed differently.
These can only be distinguished by knowing the namespace aliases from the configuration of the wiki. +//! - Text matching the configured set of magic words is parsed as magic words. +//! - Extension tags have the same syntax as HTML tags, but are parsed differently. The configuration tells which tag names are to be treated as extension tags. +//! +//! The configuration can be seen by making a request to the [site info](https://www.mediawiki.org/wiki/API:Siteinfo) resource on the wiki. The utility [Fetch site configuration](https://github.com/portstrom/fetch_site_configuration) fetches the parts of the configuration needed for parsing pages in the wiki, and outputs Rust code for instantiating a parser with that configuration. Parse Wiki Text contains a default configuration that can be used for testing. +//! +//! # Limitations +//! +//! Wiki text was never designed to be possible to parse into a structured format. It's designed to be parsed in multiple passes, where each pass depends on the output of the previous pass. Most importantly, templates are expanded in an earlier pass and formatting codes are parsed in a later pass. This means the formatting codes you see in the original text are not necessarily the same as the parser will see after templates have been expanded. Luckily this is as bad for human editors as it is for computers, so people tend to avoid writing templates that cause formatting codes to be parsed in a way that differs from what they would expect from reading the original wiki text before expanding templates. Parse Wiki Text assumes that templates never change the meaning of formatting codes around them. +//! +//! # Sandbox +//! +//! A sandbox ([Github](https://github.com/portstrom/parse_wiki_text_sandbox), [try online](https://portstrom.com/parse_wiki_text_sandbox/)) is available that allows interactively entering wiki text and inspecting the result of parsing it. +//! +//! # Comparison with Mediawiki Parser +//! +//!
There is another crate called Mediawiki Parser ([crates.io](https://crates.io/crates/mediawiki_parser), [Github](https://github.com/vroland/mediawiki-parser)) that does basically the same thing, parsing wiki text to a tree of elements. That crate however doesn't take into account any of the astonishing amount of weirdness required to correctly parse wiki text. That crate admittedly only parses a subset of wiki text, with the intention to report errors for any text that is too weird to fit that subset, which is a good intention, but when examining it, that subset is quickly found to be too small to parse pages from actual wikis, and even worse, the error reporting is just an empty promise, and there's no indication when a text is incorrectly parsed. +//! +//! That crate could possibly be improved to always report errors when a text isn't in the supported subset, but pages found in real wikis very often don't conform to the small subset of wiki text that can be parsed without weirdness, so it still wouldn't be useful. Improving that crate to correctly parse a large enough subset of wiki text would be as much effort as starting over from scratch, which is why Parse Wiki Text was made without taking anything from Mediawiki Parser. Parse Wiki Text aims to correctly parse all wiki text, not just a subset, and report warnings when encountering weirdness that should be avoided. +//! +//! # Examples +//! +//! The default configuration is used for testing purposes only. +//! For parsing a real wiki you need a site-specific configuration. +//! Reuse the same configuration when parsing multiple pages for efficiency. +//! +//! ``` +//! use parse_wiki_text::{Configuration, Node}; +//! let wiki_text = concat!( +//! "==Our values==\n", +//! "*Correctness\n", +//! "*Speed\n", +//! "*Ergonomics" +//! ); +//! let result = Configuration::default().parse(wiki_text); +//! assert!(result.warnings.is_empty()); +//! # let mut found = false; +//! for node in result.nodes { +//! 
if let Node::UnorderedList { items, .. } = node { +//! println!("Our values are:"); +//! for item in items { +//! println!("- {}", item.nodes.iter().map(|node| match node { +//! Node::Text { value, .. } => value, +//! _ => "" +//! }).collect::<String>()); +//! # found = true; +//! } +//! } +//! } +//! # assert!(found); +//! ``` + +#![forbid(unsafe_code)] +#![warn(missing_docs)] + +mod bold_italic; +mod case_folding_simple; +mod character_entity; +mod comment; +mod configuration; +mod default; +mod external_link; +mod heading; +mod html_entities; +mod line; +mod link; +mod list; +mod magic_word; +mod parse; +mod positioned; +mod redirect; +mod state; +mod table; +mod tag; +mod template; +mod trie; +mod warning; + +pub use configuration::ConfigurationSource; +use configuration::Namespace; +use state::{OpenNode, OpenNodeType, State}; +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, +}; +use trie::Trie; +pub use warning::{Warning, WarningMessage}; + +/// Configuration for the parser. +/// +/// A configuration to correctly parse a real wiki can be created with `Configuration::new`. A configuration for testing and quick and dirty prototyping can be created with `Default::default`. +pub struct Configuration { + character_entities: Trie<char>, + link_trail_character_set: HashSet<char>, + magic_words: Trie<()>, + namespaces: Trie<Namespace>, + protocols: Trie<()>, + redirect_magic_words: Trie<()>, + tag_name_map: HashMap<String, TagClass>, +} + +/// List item of a definition list. +#[derive(Debug)] +pub struct DefinitionListItem<'a> { + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The content of the element. + pub nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + pub start: usize, + + /// The type of list item. + pub type_: DefinitionListItemType, +} + +/// Identifier for the type of a definition list item. 
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum DefinitionListItemType { + /// Parsed from the code `:`. + Details, + + /// Parsed from the code `;`. + Term, +} + +/// List item of an ordered list or unordered list. +#[derive(Debug)] +pub struct ListItem<'a> { + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The content of the element. + pub nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + pub start: usize, +} + +/// Parsed node. +#[derive(Debug)] +pub enum Node<'a> { + /// Toggle bold text. Parsed from the code `'''`. + Bold { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Toggle bold and italic text. Parsed from the code `'''''`. + BoldItalic { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Category. Parsed from code starting with `[[`, a category namespace and `:`. + Category { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// Additional information for sorting entries on the category page, if any. + ordinal: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + + /// The category referred to. + target: &'a str, + }, + + /// Character entity. Parsed from code starting with `&` and ending with `;`. + CharacterEntity { + /// The character represented. + character: char, + + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Comment. Parsed from code starting with `<!--`. + Comment { + /// The byte position in the wiki text where the element ends. 
+ end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Definition list. Parsed from code starting with `:` or `;`. + DefinitionList { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The list items of the list. + items: Vec<DefinitionListItem<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// End tag. Parsed from code starting with `</` and a valid tag name. + EndTag { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The tag name. + name: Cow<'a, str>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// External link. Parsed from code starting with `[` and a valid protocol. + ExternalLink { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The content of the element. + nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Heading. Parsed from code starting with `=` and ending with `=`. + Heading { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The level of the heading from 1 to 6. + level: u8, + + /// The content of the element. + nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Horizontal divider. Parsed from code starting with `----`. + HorizontalDivider { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Image. Parsed from code starting with `[[`, a file namespace and `:`. + Image { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + + /// The file name of the image. 
+ target: &'a str, + + /// Additional information for the image. + text: Vec<Node<'a>>, + }, + + /// Toggle italic text. Parsed from the code `''`. + Italic { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Link. Parsed from code starting with `[[` and ending with `]]`. + Link { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + + /// The target of the link. + target: &'a str, + + /// The text to display for the link. + text: Vec<Node<'a>>, + }, + + /// Magic word. Parsed from the code `__`, a valid magic word and `__`. + MagicWord { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Ordered list. Parsed from code starting with `#`. + OrderedList { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The list items of the list. + items: Vec<ListItem<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Paragraph break. Parsed from an empty line between elements that can appear within a paragraph. + ParagraphBreak { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Parameter. Parsed from code starting with `{{{` and ending with `}}}`. + Parameter { + /// The default value of the parameter. + default: Option<Vec<Node<'a>>>, + + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The name of the parameter. + name: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Block of preformatted text. 
Parsed from code starting with a space at the beginning of a line. + Preformatted { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The content of the element. + nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Redirect. Parsed at the start of the wiki text from code starting with `#` followed by a redirect magic word. + Redirect { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The target of the redirect. + target: &'a str, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Start tag. Parsed from code starting with `<` and a valid tag name. + StartTag { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The tag name. + name: Cow<'a, str>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Table. Parsed from code starting with `{|`. + Table { + /// The HTML attributes of the element. + attributes: Vec<Node<'a>>, + + /// The captions of the table. + captions: Vec<TableCaption<'a>>, + + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The rows of the table. + rows: Vec<TableRow<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Extension tag. Parsed from code starting with `<` and the tag name of a valid extension tag. + Tag { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The tag name. + name: Cow<'a, str>, + + /// The content of the tag, between the start tag and the end tag, if any. + nodes: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Template. Parsed from code starting with `{{` and ending with `}}`. + Template { + /// The byte position in the wiki text where the element ends. 
+ end: usize, + + /// The name of the template. + name: Vec<Node<'a>>, + + /// The parameters of the template. + parameters: Vec<Parameter<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, + + /// Plain text. + Text { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The byte position in the wiki text where the element starts. + start: usize, + + /// The text. + value: &'a str, + }, + + /// Unordered list. Parsed from code starting with `*`. + UnorderedList { + /// The byte position in the wiki text where the element ends. + end: usize, + + /// The list items of the list. + items: Vec<ListItem<'a>>, + + /// The byte position in the wiki text where the element starts. + start: usize, + }, +} + +/// Output of parsing wiki text. +#[derive(Debug)] +pub struct Output<'a> { + /// The top level of parsed nodes. + pub nodes: Vec<Node<'a>>, + + /// Warnings from the parser telling that something is not well-formed. + pub warnings: Vec<Warning>, +} + +/// Template parameter. +#[derive(Debug)] +pub struct Parameter<'a> { + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The name of the parameter, if any. + pub name: Option<Vec<Node<'a>>>, + + /// The byte position in the wiki text where the element starts. + pub start: usize, + + /// The value of the parameter. + pub value: Vec<Node<'a>>, +} + +/// Element that has a start position and end position. +pub trait Positioned { + /// The byte position in the wiki text where the element ends. + fn end(&self) -> usize; + + /// The byte position in the wiki text where the element starts. + fn start(&self) -> usize; +} + +#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] +enum TagClass { + ExtensionTag, + Tag, +} + +/// Table caption. +#[derive(Debug)] +pub struct TableCaption<'a> { + /// The HTML attributes of the element. + pub attributes: Option<Vec<Node<'a>>>, + + /// The content of the element. 
+ pub content: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The byte position in the wiki text where the element starts. + pub start: usize, +} + +/// Table cell. +#[derive(Debug)] +pub struct TableCell<'a> { + /// The HTML attributes of the element. + pub attributes: Option<Vec<Node<'a>>>, + + /// The content of the element. + pub content: Vec<Node<'a>>, + + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The byte position in the wiki text where the element starts. + pub start: usize, + + /// The type of cell. + pub type_: TableCellType, +} + +/// Type of table cell. +#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)] +pub enum TableCellType { + /// Heading cell. + Heading, + + /// Ordinary cell. + Ordinary, +} + +/// Table row. +#[derive(Debug)] +pub struct TableRow<'a> { + /// The HTML attributes of the element. + pub attributes: Vec<Node<'a>>, + + /// The cells in the row. + pub cells: Vec<TableCell<'a>>, + + /// The byte position in the wiki text where the element ends. + pub end: usize, + + /// The byte position in the wiki text where the element starts. + pub start: usize, +} diff --git a/parse_wiki_text/src/line.rs b/parse_wiki_text/src/line.rs new file mode 100644 index 0000000..f906cb4 --- /dev/null +++ b/parse_wiki_text/src/line.rs @@ -0,0 +1,248 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +pub fn parse_beginning_of_line(state: &mut crate::State, line_start_position: Option<usize>) { + let mut has_line_break = false; + 'a: loop { + match state.get_byte(state.scan_position) { + None => { + if line_start_position.is_none() { + state.flushed_position = state.scan_position; + } + return; + } + Some(b'\t') => { + state.scan_position += 1; + loop { + match state.get_byte(state.scan_position) { + None | Some(b'\n') => continue 'a, + Some(b'\t') | Some(b' ') => state.scan_position += 1, + Some(_) => break 'a, + } + } + } + Some(b'\n') => { + if has_line_break { + state.warnings.push(crate::Warning { + end: state.scan_position + 1, + message: crate::WarningMessage::RepeatedEmptyLine, + start: state.scan_position, + }); + } + has_line_break = true; + state.scan_position += 1; + } + Some(b' ') => { + state.scan_position += 1; + let start_position = state.scan_position; + loop { + match state.get_byte(state.scan_position) { + None => return, + Some(b'\n') => break, + Some(b'\t') | Some(b' ') => state.scan_position += 1, + Some(b'{') if state.get_byte(state.scan_position + 1) == Some(b'|') => { + crate::table::start_table(state, line_start_position); + return; + } + Some(_) => { + if let Some(position) = line_start_position { + let position = state.skip_whitespace_backwards(position); + state.flush(position); + } + state.flushed_position = state.scan_position; + state.push_open_node(crate::OpenNodeType::Preformatted, start_position); + return; + } + } + } + } + Some(b'#') | Some(b'*') | Some(b':') | Some(b';') => { + if let Some(position) = line_start_position { + let position = state.skip_whitespace_backwards(position); + state.flush(position); + } + state.flushed_position = state.scan_position; + while crate::list::parse_list_item_start(state) {} + crate::list::skip_spaces(state); + return; + } + Some(b'-') => { + if state.get_byte(state.scan_position + 1) == Some(b'-') + && state.get_byte(state.scan_position + 2) == Some(b'-') + && 
state.get_byte(state.scan_position + 3) == Some(b'-') + { + if let Some(position) = line_start_position { + let position = state.skip_whitespace_backwards(position); + state.flush(position); + } + let start = state.scan_position; + state.scan_position += 4; + while state.get_byte(state.scan_position) == Some(b'-') { + state.scan_position += 1; + } + state.nodes.push(crate::Node::HorizontalDivider { + end: state.scan_position, + start, + }); + while let Some(character) = state.get_byte(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + b'\n' => { + state.scan_position += 1; + state.skip_empty_lines(); + break; + } + _ => break, + } + } + state.flushed_position = state.scan_position; + return; + } + break; + } + Some(b'=') => { + if let Some(position) = line_start_position { + let position = state.skip_whitespace_backwards(position); + state.flush(position); + } + crate::heading::parse_heading_start(state); + return; + } + Some(b'{') => { + if state.get_byte(state.scan_position + 1) == Some(b'|') { + crate::table::start_table(state, line_start_position); + return; + } + break; + } + Some(_) => break, + } + } + match line_start_position { + None => state.flushed_position = state.scan_position, + Some(position) => { + if has_line_break { + let flush_position = state.skip_whitespace_backwards(position); + state.flush(flush_position); + state.nodes.push(crate::Node::ParagraphBreak { + end: state.scan_position, + start: position, + }); + state.flushed_position = state.scan_position; + } + } + } +} + +pub fn parse_end_of_line(state: &mut crate::State) { + match state.stack.last() { + None => { + let position = state.scan_position; + state.scan_position += 1; + parse_beginning_of_line(state, Some(position)); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::DefinitionList { .. }, + .. + }) + | Some(crate::OpenNode { + type_: crate::OpenNodeType::OrderedList { .. }, + .. 
+ }) + | Some(crate::OpenNode { + type_: crate::OpenNodeType::UnorderedList { .. }, + .. + }) => { + crate::list::parse_list_end_of_line(state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::ExternalLink { .. }, + .. + }) => { + crate::external_link::parse_external_link_end_of_line(state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Heading { .. }, + .. + }) => { + crate::heading::parse_heading_end(state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Link { .. }, + .. + }) + | Some(crate::OpenNode { + type_: crate::OpenNodeType::Parameter { .. }, + .. + }) + | Some(crate::OpenNode { + type_: crate::OpenNodeType::Tag { .. }, + .. + }) + | Some(crate::OpenNode { + type_: crate::OpenNodeType::Template { .. }, + .. + }) => { + state.scan_position += 1; + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Preformatted, + .. + }) => { + parse_preformatted_end_of_line(state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Table { .. }, + .. + }) => { + crate::table::parse_table_end_of_line(state, true); + } + } +} + +fn parse_preformatted_end_of_line(state: &mut crate::State) { + if state.get_byte(state.scan_position + 1) == Some(b' ') { + let mut position = state.scan_position + 2; + loop { + match state.get_byte(position) { + None => break, + Some(b'\t') | Some(b' ') => position += 1, + Some(b'{') if state.get_byte(position + 1) == Some(b'|') => { + break; + } + Some(b'|') + if state.get_byte(position + 1) == Some(b'}') + && state.stack.len() > 1 + && match state.stack.get(state.stack.len() - 2) { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Table { .. }, + .. 
+ }) => true, + _ => false, + } => + { + break; + } + Some(_) => { + let position = state.scan_position + 1; + state.flush(position); + state.scan_position += 2; + state.flushed_position = state.scan_position; + return; + } + } + } + } + let open_node = state.stack.pop().unwrap(); + let position = state.skip_whitespace_backwards(state.scan_position); + state.flush(position); + state.scan_position += 1; + let nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + state.nodes.push(crate::Node::Preformatted { + end: state.scan_position, + nodes, + start: open_node.start, + }); + state.skip_empty_lines(); +} diff --git a/parse_wiki_text/src/link.rs b/parse_wiki_text/src/link.rs new file mode 100644 index 0000000..24b3ba5 --- /dev/null +++ b/parse_wiki_text/src/link.rs @@ -0,0 +1,196 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_link_end<'a>( + state: &mut crate::State<'a>, + configuration: &crate::Configuration, + start_position: usize, + nodes: Vec<crate::Node<'a>>, + namespace: Option<crate::Namespace>, + target: &'a str, +) { + let inner_end_position = state.skip_whitespace_backwards(state.scan_position); + state.flush(inner_end_position); + state.scan_position += 2; + state.flushed_position = state.scan_position; + let mut text = std::mem::replace(&mut state.nodes, nodes); + let end = state.scan_position; + let start = start_position; + state.nodes.push(match namespace { + None => { + let mut trail_end_position = end; + for character in state.wiki_text[end..].chars() { + if !configuration.link_trail_character_set.contains(&character) { + break; + } + trail_end_position += character.len_utf8(); + } + if trail_end_position > end { + text.push(crate::Node::Text { + end: trail_end_position, + start: end, + value: &state.wiki_text[end..trail_end_position], + }); + } + crate::Node::Link { + 
end: trail_end_position, + start, + target, + text, + } + } + Some(crate::Namespace::Category) => crate::Node::Category { + end, + ordinal: text, + start, + target, + }, + Some(crate::Namespace::File) => crate::Node::Image { + end, + start, + target, + text, + }, + }); +} + +pub fn parse_link_start(state: &mut crate::State, configuration: &crate::Configuration) { + if match state.stack.last() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Link { namespace, .. }, + .. + }) => *namespace != Some(crate::Namespace::File), + _ => false, + } { + let open_node = state.stack.pop().unwrap(); + state.warnings.push(crate::Warning { + end: state.scan_position, + message: crate::WarningMessage::InvalidLinkSyntax, + start: open_node.start, + }); + state.rewind(open_node.nodes, open_node.start); + return; + } + let mut target_end_position; + let target_start_position = state.skip_whitespace_forwards(state.scan_position + 2); + let namespace = match configuration + .namespaces + .find(&state.wiki_text[target_start_position..]) + { + Err(match_length) => { + target_end_position = match_length + target_start_position; + None + } + Ok((match_length, namespace)) => { + target_end_position = match_length + target_start_position; + Some(namespace) + } + }; + loop { + match state.get_byte(target_end_position) { + None | Some(b'\n') | Some(b'[') | Some(b'{') | Some(b'}') => { + parse_unexpected_end(state, target_end_position); + break; + } + Some(b']') => { + parse_end( + state, + configuration, + target_start_position, + target_end_position, + namespace, + ); + break; + } + Some(b'|') => { + state.push_open_node( + crate::OpenNodeType::Link { + namespace, + target: &state.wiki_text[target_start_position..target_end_position], + }, + target_end_position + 1, + ); + break; + } + _ => target_end_position += 1, + } + } +} + +fn parse_end( + state: &mut crate::State, + configuration: &crate::Configuration, + target_start_position: usize, + target_end_position: usize, + namespace: 
Option<crate::Namespace>, +) { + if state.get_byte(target_end_position + 1) != Some(b']') { + parse_unexpected_end(state, target_end_position); + return; + } + let start_position = state.scan_position; + state.flush(start_position); + let trail_start_position = target_end_position + 2; + let mut trail_end_position = trail_start_position; + match namespace { + Some(crate::Namespace::Category) => { + state.nodes.push(crate::Node::Category { + end: trail_end_position, + ordinal: vec![], + start: state.scan_position, + target: state.wiki_text[target_start_position..target_end_position].trim_right(), + }); + } + Some(crate::Namespace::File) => { + state.nodes.push(crate::Node::Image { + end: trail_end_position, + start: state.scan_position, + target: state.wiki_text[target_start_position..target_end_position].trim_right(), + text: vec![], + }); + } + None => { + for character in state.wiki_text[trail_start_position..].chars() { + if !configuration.link_trail_character_set.contains(&character) { + break; + } + trail_end_position += character.len_utf8(); + } + let target_text = crate::Node::Text { + end: target_end_position, + start: target_start_position, + value: &state.wiki_text[target_start_position..target_end_position], + }; + let text = if trail_end_position > trail_start_position { + vec![ + target_text, + crate::Node::Text { + end: trail_end_position, + start: trail_start_position, + value: &state.wiki_text[trail_start_position..trail_end_position], + }, + ] + } else { + vec![target_text] + }; + state.nodes.push(crate::Node::Link { + end: trail_end_position, + start: state.scan_position, + target: &state.wiki_text[target_start_position..target_end_position].trim_right(), + text, + }); + } + } + state.flushed_position = trail_end_position; + state.scan_position = trail_end_position; +} + +fn parse_unexpected_end(state: &mut crate::State, target_end_position: usize) { + state.warnings.push(crate::Warning { + end: target_end_position, + message: 
crate::WarningMessage::InvalidLinkSyntax, + start: state.scan_position, + }); + state.scan_position += 1; +} diff --git a/parse_wiki_text/src/list.rs b/parse_wiki_text/src/list.rs new file mode 100644 index 0000000..653c4df --- /dev/null +++ b/parse_wiki_text/src/list.rs @@ -0,0 +1,221 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_list_end_of_line(state: &mut crate::State) { + let item_end_position = state.skip_whitespace_backwards(state.scan_position); + state.flush(item_end_position); + state.scan_position += 1; + let mut level = 0; + for open_node in &state.stack { + match open_node.type_ { + crate::OpenNodeType::Table { .. } | crate::OpenNodeType::Tag { .. } => level += 1, + _ => break, + } + } + let start_level = level; + let mut term_level = None; + while level < state.stack.len() { + match ( + &state.stack[level].type_, + state.get_byte(state.scan_position), + ) { + (crate::OpenNodeType::DefinitionList { .. }, Some(b':')) + | (crate::OpenNodeType::OrderedList { .. }, Some(b'#')) + | (crate::OpenNodeType::UnorderedList { .. }, Some(b'*')) => { + level += 1; + state.scan_position += 1; + } + (crate::OpenNodeType::DefinitionList { .. 
}, Some(b';')) => { + if term_level.is_none() { + term_level = Some(level); + } + level += 1; + state.scan_position += 1; + } + _ => break, + } + } + if let Some(term_level) = term_level { + if level < state.stack.len() + || match state.get_byte(state.scan_position) { + Some(b'#') | Some(b'*') | Some(b':') | Some(b';') => true, + _ => false, + } + { + state.scan_position -= level - term_level; + level = term_level; + state.warnings.push(crate::Warning { + end: state.scan_position, + message: crate::WarningMessage::DefinitionTermContinuation, + start: state.scan_position - 1, + }); + } + } + while level < state.stack.len() { + let open_node = state.stack.pop().unwrap(); + let node = match open_node.type_ { + crate::OpenNodeType::DefinitionList { mut items } => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + } + crate::Node::DefinitionList { + end: item_end_position, + items, + start: open_node.start, + } + } + crate::OpenNodeType::OrderedList { mut items } => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + } + crate::Node::OrderedList { + end: item_end_position, + items, + start: open_node.start, + } + } + crate::OpenNodeType::UnorderedList { mut items } => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + } + crate::Node::UnorderedList { + end: item_end_position, + items, + start: open_node.start, + } + } + _ => unreachable!(), + }; + state.nodes.push(node); + } + state.flushed_position = state.scan_position; + if parse_list_item_start(state) { + while parse_list_item_start(state) {} + skip_spaces(state); + } else if level > 
start_level { + match state.stack.get_mut(level - 1) { + Some(crate::OpenNode { + type_: crate::OpenNodeType::DefinitionList { items }, + .. + }) => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, vec![]); + } + items.push(crate::DefinitionListItem { + end: 0, + nodes: vec![], + start: state.scan_position - 1, + type_: if state + .wiki_text + .as_bytes() + .get(state.scan_position - 1) + .cloned() + == Some(b';') + { + crate::DefinitionListItemType::Term + } else { + crate::DefinitionListItemType::Details + }, + }); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::OrderedList { items }, + .. + }) => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, vec![]); + }; + items.push(crate::ListItem { + end: 0, + nodes: vec![], + start: state.scan_position - 1, + }); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::UnorderedList { items }, + .. 
+ }) => { + { + let item_index = items.len() - 1; + let last_item = &mut items[item_index]; + last_item.end = item_end_position; + last_item.nodes = std::mem::replace(&mut state.nodes, vec![]); + }; + items.push(crate::ListItem { + end: 0, + nodes: vec![], + start: state.scan_position - 1, + }); + } + _ => unreachable!(), + } + skip_spaces(state); + } else { + state.skip_empty_lines(); + } +} + +pub fn parse_list_item_start(state: &mut crate::State) -> bool { + let open_node_type = match state.get_byte(state.scan_position) { + Some(b'#') => crate::OpenNodeType::OrderedList { + items: vec![crate::ListItem { + end: 0, + nodes: vec![], + start: state.scan_position + 1, + }], + }, + Some(b'*') => crate::OpenNodeType::UnorderedList { + items: vec![crate::ListItem { + end: 0, + nodes: vec![], + start: state.scan_position + 1, + }], + }, + Some(b':') => crate::OpenNodeType::DefinitionList { + items: vec![crate::DefinitionListItem { + end: 0, + nodes: vec![], + start: state.scan_position + 1, + type_: crate::DefinitionListItemType::Details, + }], + }, + Some(b';') => crate::OpenNodeType::DefinitionList { + items: vec![crate::DefinitionListItem { + end: 0, + nodes: vec![], + start: state.scan_position + 1, + type_: crate::DefinitionListItemType::Term, + }], + }, + _ => return false, + }; + let position = state.scan_position + 1; + state.push_open_node(open_node_type, position); + true +} + +pub fn skip_spaces(state: &mut crate::State) { + while match state.get_byte(state.scan_position) { + Some(b'\t') | Some(b' ') => true, + _ => false, + } { + state.scan_position += 1; + } + state.flushed_position = state.scan_position; +} diff --git a/parse_wiki_text/src/magic_word.rs b/parse_wiki_text/src/magic_word.rs new file mode 100644 index 0000000..40cab17 --- /dev/null +++ b/parse_wiki_text/src/magic_word.rs @@ -0,0 +1,26 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the 
top-level directory of this distribution. + +pub fn parse_magic_word(state: &mut crate::State, configuration: &crate::Configuration) { + if let Ok((match_length, _)) = configuration + .magic_words + .find(&state.wiki_text[state.scan_position + 2..]) + { + let end_position = match_length + state.scan_position + 2; + if state.get_byte(end_position) == Some(b'_') + && state.get_byte(end_position + 1) == Some(b'_') + { + let scan_position = state.scan_position; + state.flush(scan_position); + state.flushed_position = end_position + 2; + state.nodes.push(crate::Node::MagicWord { + end: state.flushed_position, + start: state.scan_position, + }); + state.scan_position = state.flushed_position; + return; + } + } + state.scan_position += 1; +} diff --git a/parse_wiki_text/src/parse.rs b/parse_wiki_text/src/parse.rs new file mode 100644 index 0000000..33314bb --- /dev/null +++ b/parse_wiki_text/src/parse.rs @@ -0,0 +1,208 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +#[must_use] +pub fn parse<'a>(configuration: &crate::Configuration, wiki_text: &'a str) -> crate::Output<'a> { + let mut state = crate::State { + flushed_position: 0, + nodes: vec![], + scan_position: 0, + stack: vec![], + warnings: vec![], + wiki_text, + }; + { + let mut has_line_break = false; + let mut position = 0; + loop { + match state.get_byte(position) { + Some(b'\n') => { + if has_line_break { + state.warnings.push(crate::Warning { + end: position + 1, + message: crate::WarningMessage::RepeatedEmptyLine, + start: position, + }); + } + has_line_break = true; + position += 1; + state.flushed_position = position; + state.scan_position = position; + } + Some(b' ') => position += 1, + Some(b'#') => { + crate::redirect::parse_redirect(&mut state, configuration, position); + break; + } + _ => break, + } + } + } + crate::line::parse_beginning_of_line(&mut state, None); + loop { + match state.get_byte(state.scan_position) { + None => { + crate::line::parse_end_of_line(&mut state); + if state.scan_position < state.wiki_text.len() { + continue; + } + if let Some(crate::OpenNode { nodes, start, .. 
}) = state.stack.pop() { + state.warnings.push(crate::Warning { + end: state.scan_position, + message: crate::WarningMessage::MissingEndTagRewinding, + start, + }); + state.rewind(nodes, start); + } else { + break; + } + } + Some(0) | Some(1) | Some(2) | Some(3) | Some(4) | Some(5) | Some(6) | Some(7) + | Some(8) | Some(11) | Some(12) | Some(13) | Some(14) | Some(15) | Some(16) + | Some(17) | Some(18) | Some(19) | Some(20) | Some(21) | Some(22) | Some(23) + | Some(24) | Some(25) | Some(26) | Some(27) | Some(28) | Some(29) | Some(30) + | Some(31) | Some(127) => { + state.warnings.push(crate::Warning { + end: state.scan_position + 1, + message: crate::WarningMessage::InvalidCharacter, + start: state.scan_position, + }); + state.scan_position += 1; + } + Some(b'\n') => { + crate::line::parse_end_of_line(&mut state); + } + Some(b'!') + if state.get_byte(state.scan_position + 1) == Some(b'!') + && match state.stack.last() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Table(..), + .. 
+ }) => true, + _ => false, + } => + { + crate::table::parse_heading_cell(&mut state); + } + Some(b'&') => { + crate::character_entity::parse_character_entity(&mut state, configuration) + } + Some(b'\'') => { + if state.get_byte(state.scan_position + 1) == Some(b'\'') { + crate::bold_italic::parse_bold_italic(&mut state); + } else { + state.scan_position += 1; + } + } + Some(b'<') => match state.get_byte(state.scan_position + 1) { + Some(b'!') + if state.get_byte(state.scan_position + 2) == Some(b'-') + && state.get_byte(state.scan_position + 3) == Some(b'-') => + { + crate::comment::parse_comment(&mut state) + } + Some(b'/') => crate::tag::parse_end_tag(&mut state, configuration), + _ => crate::tag::parse_start_tag(&mut state, configuration), + }, + Some(b'=') => { + crate::template::parse_parameter_name_end(&mut state); + } + Some(b'[') => { + if state.get_byte(state.scan_position + 1) == Some(b'[') { + crate::link::parse_link_start(&mut state, configuration); + } else { + crate::external_link::parse_external_link_start(&mut state, configuration); + } + } + Some(b']') => match state.stack.pop() { + None => state.scan_position += 1, + Some(crate::OpenNode { + nodes, + start, + type_: crate::OpenNodeType::ExternalLink, + }) => { + crate::external_link::parse_external_link_end(&mut state, start, nodes); + } + Some(crate::OpenNode { + nodes, + start, + type_: crate::OpenNodeType::Link { namespace, target }, + }) => { + if state.get_byte(state.scan_position + 1) == Some(b']') { + crate::link::parse_link_end( + &mut state, + &configuration, + start, + nodes, + namespace, + target, + ); + } else { + state.scan_position += 1; + state.stack.push(crate::OpenNode { + nodes, + start, + type_: crate::OpenNodeType::Link { namespace, target }, + }); + } + } + Some(open_node) => { + state.scan_position += 1; + state.stack.push(open_node); + } + }, + Some(b'_') => { + if state.get_byte(state.scan_position + 1) == Some(b'_') { + crate::magic_word::parse_magic_word(&mut state, 
configuration); + } else { + state.scan_position += 1; + } + } + Some(b'{') => { + if state.get_byte(state.scan_position + 1) == Some(b'{') { + crate::template::parse_template_start(&mut state); + } else { + state.scan_position += 1; + } + } + Some(b'|') => match state.stack.last_mut() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Parameter { default: None, .. }, + .. + }) => { + crate::template::parse_parameter_separator(&mut state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Table(..), + .. + }) => { + crate::table::parse_inline_token(&mut state); + } + Some(crate::OpenNode { + type_: crate::OpenNodeType::Template { .. }, + .. + }) => { + crate::template::parse_template_separator(&mut state); + } + _ => state.scan_position += 1, + }, + Some(b'}') => { + if state.get_byte(state.scan_position + 1) == Some(b'}') { + crate::template::parse_template_end(&mut state); + } else { + state.scan_position += 1; + } + } + _ => { + state.scan_position += 1; + } + } + } + let end_position = state.skip_whitespace_backwards(wiki_text.len()); + state.flush(end_position); + crate::Output { + nodes: state.nodes, + warnings: state.warnings, + } +} diff --git a/parse_wiki_text/src/positioned.rs b/parse_wiki_text/src/positioned.rs new file mode 100644 index 0000000..d2793fa --- /dev/null +++ b/parse_wiki_text/src/positioned.rs @@ -0,0 +1,86 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +macro_rules! 
impl_positioned { + ($type:tt) => { + impl<'a> crate::Positioned for crate::$type<'a> { + fn end(&self) -> usize { + self.end + } + + fn start(&self) -> usize { + self.start + } + } + }; +} + +impl_positioned!(DefinitionListItem); +impl_positioned!(ListItem); +impl_positioned!(Parameter); +impl_positioned!(TableCaption); +impl_positioned!(TableCell); +impl_positioned!(TableRow); + +impl<'a> crate::Positioned for crate::Node<'a> { + fn end(&self) -> usize { + match *self { + crate::Node::Bold { end, .. } => end, + crate::Node::BoldItalic { end, .. } => end, + crate::Node::Category { end, .. } => end, + crate::Node::CharacterEntity { end, .. } => end, + crate::Node::Comment { end, .. } => end, + crate::Node::DefinitionList { end, .. } => end, + crate::Node::EndTag { end, .. } => end, + crate::Node::ExternalLink { end, .. } => end, + crate::Node::Heading { end, .. } => end, + crate::Node::HorizontalDivider { end, .. } => end, + crate::Node::Image { end, .. } => end, + crate::Node::Italic { end, .. } => end, + crate::Node::Link { end, .. } => end, + crate::Node::MagicWord { end, .. } => end, + crate::Node::OrderedList { end, .. } => end, + crate::Node::ParagraphBreak { end, .. } => end, + crate::Node::Parameter { end, .. } => end, + crate::Node::Preformatted { end, .. } => end, + crate::Node::Redirect { end, .. } => end, + crate::Node::StartTag { end, .. } => end, + crate::Node::Table { end, .. } => end, + crate::Node::Tag { end, .. } => end, + crate::Node::Template { end, .. } => end, + crate::Node::Text { end, .. } => end, + crate::Node::UnorderedList { end, .. } => end, + } + } + + fn start(&self) -> usize { + match *self { + crate::Node::Bold { start, .. } => start, + crate::Node::BoldItalic { start, .. } => start, + crate::Node::Category { start, .. } => start, + crate::Node::CharacterEntity { start, .. } => start, + crate::Node::Comment { start, .. } => start, + crate::Node::DefinitionList { start, .. } => start, + crate::Node::EndTag { start, .. 
} => start, + crate::Node::ExternalLink { start, .. } => start, + crate::Node::Heading { start, .. } => start, + crate::Node::HorizontalDivider { start, .. } => start, + crate::Node::Image { start, .. } => start, + crate::Node::Italic { start, .. } => start, + crate::Node::Link { start, .. } => start, + crate::Node::MagicWord { start, .. } => start, + crate::Node::OrderedList { start, .. } => start, + crate::Node::ParagraphBreak { start, .. } => start, + crate::Node::Parameter { start, .. } => start, + crate::Node::Preformatted { start, .. } => start, + crate::Node::Redirect { start, .. } => start, + crate::Node::StartTag { start, .. } => start, + crate::Node::Table { start, .. } => start, + crate::Node::Tag { start, .. } => start, + crate::Node::Template { start, .. } => start, + crate::Node::Text { start, .. } => start, + crate::Node::UnorderedList { start, .. } => start, + } + } +} diff --git a/parse_wiki_text/src/redirect.rs b/parse_wiki_text/src/redirect.rs new file mode 100644 index 0000000..6e3f4ec --- /dev/null +++ b/parse_wiki_text/src/redirect.rs @@ -0,0 +1,86 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +pub fn parse_redirect( + state: &mut crate::State, + configuration: &crate::Configuration, + start_position: usize, +) { + let mut position = match configuration + .redirect_magic_words + .find(&state.wiki_text[start_position + 1..]) + { + Err(_) => return, + Ok((match_length, _)) => match_length + start_position + 1, + }; + loop { + match state.get_byte(position) { + Some(b'\t') | Some(b'\n') | Some(b' ') => position += 1, + Some(b':') => { + position += 1; + loop { + match state.get_byte(position) { + Some(b'\t') | Some(b'\n') | Some(b' ') => position += 1, + Some(b'[') => break, + _ => return, + } + } + break; + } + Some(b'[') => break, + _ => return, + } + } + if state.get_byte(position + 1) != Some(b'[') { + return; + } + position += 2; + let target_end_position; + let target_start_position = position; + loop { + match state.get_byte(position) { + None | Some(b'\n') | Some(b'[') | Some(b'{') | Some(b'}') => return, + Some(b']') => { + target_end_position = position; + break; + } + Some(b'|') => { + state.warnings.push(crate::Warning { + end: position + 1, + message: crate::WarningMessage::UselessTextInRedirect, + start: position, + }); + target_end_position = position; + position += 1; + loop { + match state.get_byte(position) { + None | Some(b'\n') => return, + Some(b']') => break, + Some(_) => position += 1, + } + } + break; + } + Some(_) => position += 1, + } + } + if state.get_byte(position + 1) == Some(b']') { + position += 2; + state.nodes.push(crate::Node::Redirect { + end: position, + start: start_position, + target: &state.wiki_text[target_start_position..target_end_position], + }); + state.flushed_position = state.skip_whitespace_forwards(position); + state.scan_position = state.flushed_position; + if state.wiki_text.len() > position { + state.warnings.push(crate::Warning { + end: state.wiki_text.len(), + message: crate::WarningMessage::TextAfterRedirect, + start: start_position, + }); + } + return; + } +} diff --git a/parse_wiki_text/src/state.rs 
b/parse_wiki_text/src/state.rs new file mode 100644 index 0000000..34d7a9c --- /dev/null +++ b/parse_wiki_text/src/state.rs @@ -0,0 +1,174 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub struct OpenNode<'a> { + pub nodes: Vec<crate::Node<'a>>, + pub start: usize, + pub type_: OpenNodeType<'a>, +} + +pub enum OpenNodeType<'a> { + DefinitionList { + items: Vec<crate::DefinitionListItem<'a>>, + }, + ExternalLink, + Heading { + level: u8, + }, + Link { + namespace: Option<crate::Namespace>, + target: &'a str, + }, + OrderedList { + items: Vec<crate::ListItem<'a>>, + }, + Parameter { + default: Option<Vec<crate::Node<'a>>>, + name: Option<Vec<crate::Node<'a>>>, + }, + Preformatted, + Table(Table<'a>), + Tag { + name: crate::Cow<'a, str>, + }, + Template { + name: Option<Vec<crate::Node<'a>>>, + parameters: Vec<crate::Parameter<'a>>, + }, + UnorderedList { + items: Vec<crate::ListItem<'a>>, + }, +} + +pub struct State<'a> { + pub flushed_position: usize, + pub nodes: Vec<crate::Node<'a>>, + pub scan_position: usize, + pub stack: Vec<OpenNode<'a>>, + pub warnings: Vec<crate::Warning>, + pub wiki_text: &'a str, +} + +pub struct Table<'a> { + pub attributes: Vec<crate::Node<'a>>, + pub before: Vec<crate::Node<'a>>, + pub captions: Vec<crate::TableCaption<'a>>, + pub child_element_attributes: Option<Vec<crate::Node<'a>>>, + pub rows: Vec<crate::TableRow<'a>>, + pub start: usize, + pub state: TableState, +} + +pub enum TableState { + Before, + CaptionFirstLine, + CaptionRemainder, + CellFirstLine, + CellRemainder, + HeadingFirstLine, + HeadingRemainder, + Row, + TableAttributes, +} + +impl<'a> State<'a> { + pub fn flush(&mut self, end_position: usize) { + flush( + &mut self.nodes, + self.flushed_position, + end_position, + self.wiki_text, + ); + } + + pub fn get_byte(&self, position: usize) -> Option<u8> { + 
self.wiki_text.as_bytes().get(position).cloned() + } + + pub fn push_open_node(&mut self, type_: OpenNodeType<'a>, inner_start_position: usize) { + let scan_position = self.scan_position; + self.flush(scan_position); + self.stack.push(OpenNode { + nodes: std::mem::replace(&mut self.nodes, vec![]), + start: scan_position, + type_, + }); + self.scan_position = inner_start_position; + self.flushed_position = inner_start_position; + } + + pub fn rewind(&mut self, nodes: Vec<crate::Node<'a>>, position: usize) { + self.scan_position = position + 1; + self.nodes = nodes; + if let Some(position_before_text) = match self.nodes.last() { + Some(crate::Node::Text { start, .. }) => Some(*start), + _ => None, + } { + self.nodes.pop(); + self.flushed_position = position_before_text; + } else { + self.flushed_position = position; + } + } + + pub fn skip_empty_lines(&mut self) { + match self.stack.last() { + Some(OpenNode { + type_: OpenNodeType::Table { .. }, + .. + }) => { + self.scan_position -= 1; + crate::table::parse_table_end_of_line(self, false); + } + _ => { + crate::line::parse_beginning_of_line(self, None); + } + } + } + + pub fn skip_whitespace_backwards(&self, position: usize) -> usize { + skip_whitespace_backwards(self.wiki_text, position) + } + + pub fn skip_whitespace_forwards(&self, position: usize) -> usize { + skip_whitespace_forwards(self.wiki_text, position) + } +} + +pub fn flush<'a>( + nodes: &mut Vec<crate::Node<'a>>, + flushed_position: usize, + end_position: usize, + wiki_text: &'a str, +) { + if end_position > flushed_position { + nodes.push(crate::Node::Text { + end: end_position, + start: flushed_position, + value: &wiki_text[flushed_position..end_position], + }); + } +} + +pub fn skip_whitespace_backwards(wiki_text: &str, mut position: usize) -> usize { + while position > 0 + && match wiki_text.as_bytes()[position - 1] { + b'\t' | b'\n' | b' ' => true, + _ => false, + } + { + position -= 1; + } + position +} + +pub fn 
skip_whitespace_forwards(wiki_text: &str, mut position: usize) -> usize { + while match wiki_text.as_bytes().get(position).cloned() { + Some(b'\t') | Some(b'\n') | Some(b' ') => true, + _ => false, + } { + position += 1; + } + position +} diff --git a/parse_wiki_text/src/table.rs b/parse_wiki_text/src/table.rs new file mode 100644 index 0000000..00062d6 --- /dev/null +++ b/parse_wiki_text/src/table.rs @@ -0,0 +1,631 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +use crate::state::TableState; + +pub fn parse_heading_cell(state: &mut crate::State) { + let table = get_table(&mut state.stack); + let position_before_token = state.scan_position; + if let crate::state::TableState::HeadingFirstLine = table.state { + let end = crate::state::skip_whitespace_backwards(state.wiki_text, position_before_token); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + end, + state.wiki_text, + ); + if table.rows.is_empty() { + table.rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end, + start: table.start, + }); + } + let row = table.rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + type_: crate::TableCellType::Heading, + }); + row.end = end; + table.start = position_before_token; + state.scan_position = position_before_token + 2; + while let Some(character) = state.wiki_text.as_bytes().get(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + _ => break, + } + } + state.flushed_position = state.scan_position; + } else { + state.scan_position += 2; + } +} + +pub fn parse_table_end_of_line(state: &mut crate::State, paragraph_break_possible: bool) { + let position_before_line_break = state.scan_position; + let 
mut position_after_line_break = position_before_line_break + 1; + let mut scan_position = position_after_line_break; + loop { + match state.get_byte(scan_position) { + Some(b'\n') => { + scan_position += 1; + position_after_line_break = scan_position; + } + Some(b'\t') | Some(b' ') => scan_position += 1, + Some(b'!') => { + change_state( + state, + TableState::HeadingFirstLine, + position_before_line_break, + scan_position, + scan_position + 1, + paragraph_break_possible, + ); + break; + } + Some(b'|') => { + match state.get_byte(scan_position + 1) { + Some(b'+') => change_state( + state, + TableState::CaptionFirstLine, + position_before_line_break, + scan_position, + scan_position + 2, + paragraph_break_possible, + ), + Some(b'-') => change_state( + state, + TableState::Row, + position_before_line_break, + scan_position, + scan_position + 2, + paragraph_break_possible, + ), + Some(b'}') => parse_end( + state, + position_before_line_break, + scan_position + 2, + paragraph_break_possible, + ), + _ => change_state( + state, + TableState::CellFirstLine, + position_before_line_break, + scan_position, + scan_position + 1, + paragraph_break_possible, + ), + } + break; + } + _ => { + parse_line_break( + state, + position_before_line_break, + position_after_line_break, + scan_position, + paragraph_break_possible, + ); + break; + } + } + } +} + +fn change_state( + state: &mut crate::State, + target_table_state: TableState, + position_before_line_break: usize, + position_before_token: usize, + mut position_after_token: usize, + paragraph_break_possible: bool, +) { + while let Some(character) = state.get_byte(position_after_token) { + match character { + b'\t' | b' ' => position_after_token += 1, + _ => break, + } + } + let table = get_table(&mut state.stack); + let end = crate::state::skip_whitespace_backwards(state.wiki_text, position_before_line_break); + if paragraph_break_possible { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + end, + 
state.wiki_text, + ); + } + match table.state { + TableState::Before => { + state.warnings.push(crate::Warning { + end: position_before_line_break, + message: crate::WarningMessage::StrayTextInTable, + start: table.start, + }); + table + .before + .append(&mut std::mem::replace(&mut state.nodes, vec![])); + } + TableState::CaptionFirstLine | TableState::CaptionRemainder => { + table.captions.push(crate::TableCaption { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + }); + } + TableState::CellFirstLine | TableState::CellRemainder => { + if table.rows.is_empty() { + table.rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end, + start: table.start, + }); + } + let row = table.rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + type_: crate::TableCellType::Ordinary, + }); + row.end = end; + } + TableState::HeadingFirstLine | TableState::HeadingRemainder => { + if table.rows.is_empty() { + table.rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end, + start: table.start, + }); + } + let row = table.rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + type_: crate::TableCellType::Heading, + }); + row.end = position_before_line_break; + } + TableState::Row => { + table.rows.push(crate::TableRow { + attributes: std::mem::replace(&mut state.nodes, vec![]), + cells: vec![], + end, + start: table.start, + }); + } + TableState::TableAttributes => { + table.attributes = std::mem::replace(&mut state.nodes, vec![]); + } + } + table.start = position_before_token; + table.state = target_table_state; + state.flushed_position = position_after_token; + 
state.scan_position = position_after_token; +} + +fn parse_end( + state: &mut crate::State, + position_before_line_break: usize, + position_after_token: usize, + paragraph_break_possible: bool, +) { + let open_node = state.stack.pop().unwrap(); + match open_node.type_ { + crate::OpenNodeType::Table(crate::state::Table { + mut attributes, + mut before, + mut captions, + mut child_element_attributes, + mut rows, + start, + state: table_state, + }) => { + if paragraph_break_possible { + state.flush(crate::state::skip_whitespace_backwards( + state.wiki_text, + position_before_line_break, + )); + } + match table_state { + TableState::Before => { + state.warnings.push(crate::Warning { + end: position_before_line_break, + message: crate::WarningMessage::StrayTextInTable, + start, + }); + before.append(&mut std::mem::replace(&mut state.nodes, open_node.nodes)); + } + TableState::CaptionFirstLine | TableState::CaptionRemainder => { + captions.push(crate::TableCaption { + attributes: child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, open_node.nodes), + end: position_before_line_break, + start, + }); + } + TableState::CellFirstLine | TableState::CellRemainder => { + if rows.is_empty() { + rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end: 0, + start, + }); + } + let row = rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, open_node.nodes), + end: position_before_line_break, + start, + type_: crate::TableCellType::Ordinary, + }); + row.end = position_before_line_break; + } + TableState::HeadingFirstLine | TableState::HeadingRemainder => { + if rows.is_empty() { + rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end: 0, + start, + }); + } + let row = rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, 
open_node.nodes), + end: position_before_line_break, + start, + type_: crate::TableCellType::Heading, + }); + row.end = position_before_line_break; + } + TableState::Row => { + rows.push(crate::TableRow { + attributes: std::mem::replace(&mut state.nodes, open_node.nodes), + cells: vec![], + end: position_before_line_break, + start, + }); + } + TableState::TableAttributes => { + attributes = std::mem::replace(&mut state.nodes, open_node.nodes); + } + } + state.scan_position = position_after_token; + state.nodes.append(&mut before); + state.nodes.push(crate::Node::Table { + attributes, + captions, + end: state.scan_position, + rows, + start: open_node.start, + }); + while let Some(character) = state.get_byte(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + b'\n' => { + state.scan_position += 1; + state.skip_empty_lines(); + break; + } + _ => break, + } + } + state.flushed_position = state.scan_position; + } + _ => unreachable!(), + } +} + +fn parse_line_break( + state: &mut crate::State, + position_before_line_break: usize, + position_after_line_break: usize, + position_after_token: usize, + paragraph_break_possible: bool, +) { + { + let table = get_table(&mut state.stack); + match table.state { + TableState::Before | TableState::CaptionRemainder => { + state.scan_position = position_after_token + } + TableState::CaptionFirstLine => { + table.state = TableState::CaptionRemainder; + if state.nodes.is_empty() && state.flushed_position == position_before_line_break { + state.flushed_position = position_after_token; + } + state.scan_position = position_after_token; + if position_after_token != position_after_line_break { + return; + } + } + TableState::CellFirstLine => { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards( + state.wiki_text, + position_before_line_break, + ), + state.wiki_text, + ); + state.nodes.push(crate::Node::ParagraphBreak { + end: 
position_after_line_break, + start: position_before_line_break, + }); + table.start = position_after_line_break; + table.state = TableState::CellRemainder; + state.flushed_position = position_after_line_break; + state.scan_position = position_after_line_break; + } + TableState::HeadingFirstLine => { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards( + state.wiki_text, + position_before_line_break, + ), + state.wiki_text, + ); + state.nodes.push(crate::Node::ParagraphBreak { + end: position_after_line_break, + start: position_before_line_break, + }); + table.start = position_after_line_break; + table.state = TableState::HeadingRemainder; + state.flushed_position = position_after_line_break; + state.scan_position = position_after_line_break; + } + TableState::CellRemainder | TableState::HeadingRemainder => { + state.scan_position = position_before_line_break + 1 + } + TableState::TableAttributes => { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards( + state.wiki_text, + position_before_line_break, + ), + state.wiki_text, + ); + table.attributes = std::mem::replace(&mut state.nodes, vec![]); + table.start = position_after_token; + table.state = TableState::Before; + state.flushed_position = position_after_token; + state.scan_position = position_after_token; + if position_after_token != position_after_line_break { + return; + } + } + TableState::Row => { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards( + state.wiki_text, + position_before_line_break, + ), + state.wiki_text, + ); + table.rows.push(crate::TableRow { + attributes: std::mem::replace(&mut state.nodes, vec![]), + cells: vec![], + end: position_before_line_break, + start: table.start, + }); + table.start = position_after_token; + table.state = TableState::Before; + state.flushed_position = position_after_token; + 
state.scan_position = position_after_token; + if position_after_token != position_after_line_break { + return; + } + } + } + } + crate::line::parse_beginning_of_line( + state, + if paragraph_break_possible { + Some(position_before_line_break) + } else { + None + }, + ); +} + +pub fn parse_inline_token(state: &mut crate::State) { + let table = get_table(&mut state.stack); + let position_before_token = state.scan_position; + if state + .wiki_text + .as_bytes() + .get(position_before_token + 1) + .cloned() + == Some(b'|') + { + match table.state { + crate::state::TableState::CaptionFirstLine => { + let end = + crate::state::skip_whitespace_backwards(state.wiki_text, position_before_token); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + end, + state.wiki_text, + ); + table.captions.push(crate::TableCaption { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + }); + table.start = position_before_token; + state.scan_position = position_before_token + 2; + while let Some(character) = state.wiki_text.as_bytes().get(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + _ => break, + } + } + state.flushed_position = state.scan_position; + } + crate::state::TableState::CellFirstLine => { + let end = + crate::state::skip_whitespace_backwards(state.wiki_text, position_before_token); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + end, + state.wiki_text, + ); + if table.rows.is_empty() { + table.rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end, + start: table.start, + }); + } + let row = table.rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + type_: crate::TableCellType::Ordinary, + }); + row.end = end; + table.start = 
position_before_token; + state.scan_position = position_before_token + 2; + while let Some(character) = state.wiki_text.as_bytes().get(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + _ => break, + } + } + state.flushed_position = state.scan_position; + } + crate::state::TableState::HeadingFirstLine => { + let end = + crate::state::skip_whitespace_backwards(state.wiki_text, position_before_token); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + end, + state.wiki_text, + ); + if table.rows.is_empty() { + table.rows.push(crate::TableRow { + attributes: vec![], + cells: vec![], + end, + start: table.start, + }); + } + let row = table.rows.last_mut().unwrap(); + row.cells.push(crate::TableCell { + attributes: table.child_element_attributes.take(), + content: std::mem::replace(&mut state.nodes, vec![]), + end, + start: table.start, + type_: crate::TableCellType::Heading, + }); + row.end = end; + table.start = position_before_token; + state.scan_position = position_before_token + 2; + while let Some(character) = state.wiki_text.as_bytes().get(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + _ => break, + } + } + state.flushed_position = state.scan_position; + } + _ => state.scan_position += 2, + } + } else { + match table.state { + crate::state::TableState::CaptionFirstLine + | crate::state::TableState::CellFirstLine + | crate::state::TableState::HeadingFirstLine + if table.child_element_attributes.is_none() => + { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards(state.wiki_text, position_before_token), + state.wiki_text, + ); + table.child_element_attributes = Some(std::mem::replace(&mut state.nodes, vec![])); + state.scan_position = position_before_token + 1; + while let Some(character) = state.wiki_text.as_bytes().get(state.scan_position) { + match character { + b'\t' | b' ' => state.scan_position += 1, + 
_ => break, + } + } + state.flushed_position = state.scan_position; + } + _ => state.scan_position += 1, + } + } +} + +pub fn start_table(state: &mut crate::State, position_before_line_break: Option<usize>) { + if let Some(position) = position_before_line_break { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards(state.wiki_text, position), + state.wiki_text, + ); + } + state.flushed_position = state.scan_position; + let mut position = state.scan_position + 2; + loop { + match state.get_byte(position) { + Some(b'\t') | Some(b' ') => position += 1, + _ => break, + } + } + state.push_open_node( + crate::OpenNodeType::Table(crate::state::Table { + attributes: vec![], + before: vec![], + captions: vec![], + child_element_attributes: None, + rows: vec![], + start: 0, + state: crate::state::TableState::TableAttributes, + }), + position, + ); +} + +fn get_table<'a, 'b>(stack: &'a mut Vec<crate::OpenNode<'b>>) -> &'a mut crate::state::Table<'b> { + match stack.last_mut() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Table(table), + .. + }) => table, + _ => unreachable!(), + } +} diff --git a/parse_wiki_text/src/tag.rs b/parse_wiki_text/src/tag.rs new file mode 100644 index 0000000..3490ed3 --- /dev/null +++ b/parse_wiki_text/src/tag.rs @@ -0,0 +1,318 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +pub fn parse_end_tag(state: &mut crate::State, configuration: &crate::Configuration) { + let start_position = state.scan_position; + let tag_name_start_position = start_position + 2; + let mut tag_name_end_position = tag_name_start_position; + while let Some(character) = state.get_byte(tag_name_end_position) { + match character { + b'\t' | b'\n' | b' ' | b'/' | b'>' => break, + b'<' => { + state.scan_position += 1; + return; + } + _ => tag_name_end_position += 1, + } + } + let tag_name = &state.wiki_text[tag_name_start_position..tag_name_end_position]; + let tag_name = if tag_name.as_bytes().iter().all(u8::is_ascii_lowercase) { + crate::Cow::Borrowed(tag_name) + } else { + tag_name.to_ascii_lowercase().into() + }; + match configuration.tag_name_map.get(&tag_name as &str) { + None => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_name_end_position, + message: crate::WarningMessage::UnrecognizedTagName, + start: tag_name_start_position, + }); + } + Some(crate::TagClass::ExtensionTag) => { + let mut tag_end_position = tag_name_end_position; + loop { + match state.get_byte(tag_end_position) { + Some(b'>') => break, + Some(b'\t') | Some(b'\n') | Some(b' ') => tag_end_position += 1, + _ => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_end_position, + message: crate::WarningMessage::InvalidTagSyntax, + start: start_position, + }); + return; + } + } + } + let mut matched_node_index = None; + for (open_node_index, open_node) in state.stack.iter().enumerate().rev() { + if let crate::OpenNodeType::Tag { name, .. 
} = &open_node.type_ { + if name == &tag_name { + matched_node_index = Some(open_node_index); + break; + } + } + } + match matched_node_index { + None => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_name_end_position, + message: crate::WarningMessage::UnexpectedEndTag, + start: tag_name_start_position, + }); + } + Some(open_node_index) => { + if open_node_index < state.stack.len() - 1 { + state.warnings.push(crate::Warning { + end: tag_end_position, + message: crate::WarningMessage::MissingEndTagRewinding, + start: start_position, + }); + state.stack.truncate(open_node_index + 2); + let open_node = state.stack.pop().unwrap(); + state.rewind(open_node.nodes, open_node.start); + } else { + state.flush(start_position); + let open_node = state.stack.pop().unwrap(); + tag_end_position += 1; + state.flushed_position = tag_end_position; + state.scan_position = state.flushed_position; + let nodes = std::mem::replace(&mut state.nodes, open_node.nodes); + state.nodes.push(crate::Node::Tag { + end: state.scan_position, + name: tag_name, + nodes, + start: open_node.start, + }); + } + } + } + } + Some(crate::TagClass::Tag) => { + let mut tag_end_position = tag_name_end_position; + loop { + match state.get_byte(tag_end_position) { + None => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_name_end_position, + message: crate::WarningMessage::InvalidTagSyntax, + start: tag_name_start_position, + }); + return; + } + Some(b'>') => break, + _ => tag_end_position += 1, + } + } + state.flush(start_position); + state.flushed_position = tag_end_position + 1; + state.scan_position = state.flushed_position; + state.nodes.push(crate::Node::EndTag { + end: state.scan_position, + name: tag_name, + start: start_position, + }); + } + } +} + +pub fn parse_start_tag(state: &mut crate::State, configuration: &crate::Configuration) { + let start_position = state.scan_position; + let 
tag_name_start_position = start_position + 1; + let tag_name_end_position = match state.wiki_text.as_bytes()[tag_name_start_position..] + .iter() + .cloned() + .position(|character| match character { + b'\t' | b'\n' | b' ' | b'/' | b'>' => true, + _ => false, + }) { + None => state.wiki_text.len(), + Some(position) => tag_name_start_position + position, + }; + let tag_name = &state.wiki_text[tag_name_start_position..tag_name_end_position]; + let tag_name = if tag_name.as_bytes().iter().all(u8::is_ascii_lowercase) { + crate::Cow::Borrowed(tag_name) + } else { + tag_name.to_ascii_lowercase().into() + }; + match configuration.tag_name_map.get(&tag_name as &str) { + None => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_name_end_position, + message: crate::WarningMessage::UnrecognizedTagName, + start: tag_name_start_position, + }); + } + Some(tag_class) => match state.wiki_text.as_bytes()[tag_name_end_position..] + .iter() + .cloned() + .position(|character| character == b'>') + { + None => { + state.scan_position = tag_name_start_position; + state.warnings.push(crate::Warning { + end: tag_name_end_position, + message: crate::WarningMessage::InvalidTagSyntax, + start: state.scan_position, + }); + } + Some(tag_end_position) => { + let tag_end_position = tag_name_end_position + tag_end_position + 1; + match tag_class { + crate::TagClass::ExtensionTag => { + if state.get_byte(tag_end_position - 2) == Some(b'/') { + state.flush(start_position); + state.flushed_position = tag_end_position; + state.scan_position = state.flushed_position; + state.nodes.push(crate::Node::Tag { + end: tag_end_position, + name: tag_name, + nodes: vec![], + start: start_position, + }); + } else { + match &tag_name as _ { + "math" | "nowiki" => { + parse_plain_text_tag( + state, + start_position, + tag_end_position, + &tag_name, + ); + } + _ => { + state.push_open_node( + crate::OpenNodeType::Tag { name: tag_name }, + tag_end_position, + ); + } 
+ } + } + } + crate::TagClass::Tag => { + state.flush(start_position); + state.flushed_position = tag_end_position; + state.scan_position = state.flushed_position; + state.nodes.push(crate::Node::StartTag { + end: tag_end_position, + name: tag_name, + start: start_position, + }); + } + } + } + }, + } +} + +fn parse_plain_text_tag<'a>( + state: &mut crate::State<'a>, + position_before_start_tag: usize, + position_after_start_tag: usize, + start_tag_name: &crate::Cow<'a, str>, +) { + loop { + match state.get_byte(state.scan_position) { + None => { + state.scan_position = position_before_start_tag + 1; + state.warnings.push(crate::Warning { + end: position_after_start_tag, + message: crate::WarningMessage::MissingEndTagRewinding, + start: position_before_start_tag, + }); + break; + } + Some(b'<') => { + if state.get_byte(state.scan_position + 1) == Some(b'/') + && parse_plain_text_end_tag( + state, + position_before_start_tag, + position_after_start_tag, + &start_tag_name, + ) + { + break; + } + } + _ => {} + } + state.scan_position += 1; + } +} + +fn parse_plain_text_end_tag<'a>( + state: &mut crate::State<'a>, + position_before_start_tag: usize, + position_after_start_tag: usize, + start_tag_name: &crate::Cow<'a, str>, +) -> bool { + let position_before_end_tag = state.scan_position; + let position_before_end_tag_name = state.scan_position + 2; + let mut position_after_end_tag_name = position_before_end_tag_name; + let position_after_end_tag = loop { + match state.get_byte(position_after_end_tag_name) { + None | Some(b'/') | Some(b'<') => return false, + Some(b'\t') | Some(b'\n') | Some(b' ') => { + let position_after_end_tag = + state.skip_whitespace_forwards(position_after_end_tag_name + 1); + match state.get_byte(position_after_end_tag) { + Some(b'>') => break position_after_end_tag, + _ => return false, + } + } + Some(b'>') => break position_after_end_tag_name, + _ => position_after_end_tag_name += 1, + } + } + 1; + let end_tag_name = 
&state.wiki_text[position_before_end_tag_name..position_after_end_tag_name]; + let end_tag_name = if end_tag_name.as_bytes().iter().all(u8::is_ascii_lowercase) { + crate::Cow::Borrowed(end_tag_name) + } else { + end_tag_name.to_ascii_lowercase().into() + }; + if *start_tag_name == end_tag_name { + let nodes = if position_after_start_tag < position_before_end_tag { + vec![crate::Node::Text { + end: position_before_end_tag, + start: position_after_start_tag, + value: &state.wiki_text[position_after_start_tag..position_before_end_tag], + }] + } else { + vec![] + }; + state.flushed_position = position_after_end_tag; + state.scan_position = position_after_end_tag; + state.nodes.push(crate::Node::Tag { + end: position_after_end_tag, + name: end_tag_name, + nodes, + start: position_before_start_tag, + }); + return true; + } + let mut found = false; + for open_node in &state.stack { + if let crate::OpenNodeType::Tag { name, .. } = &open_node.type_ { + if name == &end_tag_name { + found = true; + break; + } + } + } + if found { + state.warnings.push(crate::Warning { + end: position_before_end_tag, + message: crate::WarningMessage::MissingEndTagRewinding, + start: position_before_start_tag, + }); + state.scan_position = position_before_start_tag + 1; + } + found +} diff --git a/parse_wiki_text/src/template.rs b/parse_wiki_text/src/template.rs new file mode 100644 index 0000000..494644b --- /dev/null +++ b/parse_wiki_text/src/template.rs @@ -0,0 +1,248 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +pub fn parse_parameter_name_end(state: &mut crate::State) { + let stack_length = state.stack.len(); + if stack_length > 0 { + if let crate::OpenNode { + type_: + crate::OpenNodeType::Template { + name: Some(_), + parameters, + }, + .. 
+ } = &mut state.stack[stack_length - 1] + { + let parameters_length = parameters.len(); + let name = &mut parameters[parameters_length - 1].name; + if name.is_none() { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + crate::state::skip_whitespace_backwards(state.wiki_text, state.scan_position), + state.wiki_text, + ); + state.flushed_position = crate::state::skip_whitespace_forwards( + state.wiki_text, + state.scan_position + 1, + ); + state.scan_position = state.flushed_position; + *name = Some(std::mem::replace(&mut state.nodes, vec![])); + return; + } + } + } + state.scan_position += 1; +} + +pub fn parse_parameter_separator(state: &mut crate::State) { + match state.stack.last_mut() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Parameter { default, name }, + .. + }) => { + if name.is_none() { + let position = + crate::state::skip_whitespace_backwards(state.wiki_text, state.scan_position); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + position, + state.wiki_text, + ); + *name = Some(std::mem::replace(&mut state.nodes, vec![])); + } else { + crate::state::flush( + &mut state.nodes, + state.flushed_position, + state.scan_position, + state.wiki_text, + ); + *default = Some(std::mem::replace(&mut state.nodes, vec![])); + state.warnings.push(crate::Warning { + end: state.scan_position + 1, + message: crate::WarningMessage::UselessTextInParameter, + start: state.scan_position, + }); + } + state.scan_position += 1; + state.flushed_position = state.scan_position; + } + _ => unreachable!(), + } +} + +pub fn parse_template_end(state: &mut crate::State) { + match state.stack.last() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Parameter { .. }, + .. 
+ }) => match state.stack.pop() { + Some(crate::OpenNode { + nodes, + start, + type_: crate::OpenNodeType::Parameter { default, name }, + }) => { + if state.get_byte(state.scan_position + 2) == Some(b'}') { + if let Some(name) = name { + let start_position = state.scan_position; + state.flush(start_position); + let nodes = std::mem::replace(&mut state.nodes, nodes); + state.nodes.push(crate::Node::Parameter { + default: Some(default.unwrap_or(nodes)), + end: state.scan_position, + name, + start, + }); + } else { + let start_position = state.skip_whitespace_backwards(state.scan_position); + state.flush(start_position); + let nodes = std::mem::replace(&mut state.nodes, nodes); + state.nodes.push(crate::Node::Parameter { + default: None, + end: state.scan_position, + name: nodes, + start, + }); + } + state.scan_position += 3; + state.flushed_position = state.scan_position; + } else { + state.warnings.push(crate::Warning { + end: state.scan_position + 2, + message: crate::WarningMessage::UnexpectedEndTagRewinding, + start: state.scan_position, + }); + state.rewind(nodes, start); + } + } + _ => unreachable!(), + }, + Some(crate::OpenNode { + type_: crate::OpenNodeType::Template { .. }, + .. 
+ }) => match state.stack.pop() { + Some(crate::OpenNode { + nodes, + start, + type_: + crate::OpenNodeType::Template { + name, + mut parameters, + }, + }) => { + let position = state.skip_whitespace_backwards(state.scan_position); + state.flush(position); + state.scan_position += 2; + state.flushed_position = state.scan_position; + let name = match name { + None => std::mem::replace(&mut state.nodes, nodes), + Some(name) => { + let parameters_length = parameters.len(); + let parameter = &mut parameters[parameters_length - 1]; + parameter.end = position; + parameter.value = std::mem::replace(&mut state.nodes, nodes); + name + } + }; + state.nodes.push(crate::Node::Template { + end: state.scan_position, + name, + parameters, + start, + }); + } + _ => unreachable!(), + }, + _ => { + if state + .stack + .iter() + .rev() + .skip(1) + .any(|item| match item.type_ { + crate::OpenNodeType::Parameter { .. } => { + state.get_byte(state.scan_position + 2) == Some(b'}') + } + crate::OpenNodeType::Template { .. } => true, + _ => false, + }) + { + state.warnings.push(crate::Warning { + end: state.scan_position + 2, + message: crate::WarningMessage::UnexpectedEndTagRewinding, + start: state.scan_position, + }); + let open_node = state.stack.pop().unwrap(); + state.rewind(open_node.nodes, open_node.start); + } else { + state.warnings.push(crate::Warning { + end: state.scan_position + 2, + message: crate::WarningMessage::UnexpectedEndTag, + start: state.scan_position, + }); + state.scan_position += 2; + } + } + } +} + +pub fn parse_template_separator(state: &mut crate::State) { + match state.stack.last_mut() { + Some(crate::OpenNode { + type_: crate::OpenNodeType::Template { name, parameters }, + .. 
+ }) => { + let position = + crate::state::skip_whitespace_backwards(state.wiki_text, state.scan_position); + crate::state::flush( + &mut state.nodes, + state.flushed_position, + position, + state.wiki_text, + ); + state.flushed_position = + crate::state::skip_whitespace_forwards(state.wiki_text, state.scan_position + 1); + state.scan_position = state.flushed_position; + if name.is_none() { + *name = Some(std::mem::replace(&mut state.nodes, vec![])); + } else { + let parameters_length = parameters.len(); + let parameter = &mut parameters[parameters_length - 1]; + parameter.end = position; + parameter.value = std::mem::replace(&mut state.nodes, vec![]); + } + parameters.push(crate::Parameter { + end: 0, + name: None, + start: state.scan_position, + value: vec![], + }); + } + _ => unreachable!(), + } +} + +pub fn parse_template_start(state: &mut crate::State) { + let scan_position = state.scan_position; + if state.get_byte(state.scan_position + 2) == Some(b'{') { + let position = state.skip_whitespace_forwards(scan_position + 3); + state.push_open_node( + crate::OpenNodeType::Parameter { + default: None, + name: None, + }, + position, + ); + } else { + let position = state.skip_whitespace_forwards(scan_position + 2); + state.push_open_node( + crate::OpenNodeType::Template { + name: None, + parameters: vec![], + }, + position, + ); + } +} diff --git a/parse_wiki_text/src/trie.rs b/parse_wiki_text/src/trie.rs new file mode 100644 index 0000000..f5e9dd2 --- /dev/null +++ b/parse_wiki_text/src/trie.rs @@ -0,0 +1,167 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. 
+ +use crate::case_folding_simple::CASE_FOLDING_SIMPLE; + +struct Character<T> { + character: u8, + next_state: State<T>, +} + +#[derive(Clone, Copy)] +enum State<T> { + Continue(u32), + Final(T), +} + +pub struct Trie<T> { + states: Vec<Vec<Character<T>>>, +} + +impl<T: Copy> Trie<T> { + pub fn add_case_sensitive_term(&mut self, term: &str, payload: T) -> bool { + self.add_term_internal(term, payload, false) + } + + fn add_folded_characters(&mut self, character: char, initial_state: u32, next_state: State<T>) { + if let Some(folded_characters) = simple_fold(character) { + for character in folded_characters { + let mut last_state = initial_state; + let mut character_buffer = [0; 4]; + let character_bytes = character.encode_utf8(&mut character_buffer).as_bytes(); + let mut byte_iterator = character_bytes.iter().cloned(); + let mut byte_item = byte_iterator.next(); + 'b: while let Some(byte) = byte_item { + for item in &self.states[last_state as usize] { + if item.character == byte { + match item.next_state { + State::Continue(next_state) => last_state = next_state, + State::Final(_) => unreachable!(), + } + byte_item = byte_iterator.next(); + continue 'b; + } + } + byte_item = byte_iterator.next(); + if byte_item.is_none() { + self.states[last_state as usize].push(Character { + character: byte, + next_state, + }); + break; + } + let intermediate_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(intermediate_state), + }); + last_state = intermediate_state; + self.states.push(vec![]); + } + } + } + } + + pub fn add_term(&mut self, term: &str, payload: T) -> bool { + self.add_term_internal(term, payload, true) + } + + fn add_term_internal(&mut self, term: &str, payload: T, case_folded: bool) -> bool { + let mut last_state = 0; + let mut character_iterator = term.chars(); + let mut character_item = character_iterator.next(); + while let Some(character) = character_item { + let mut 
character_buffer = [0; 4]; + let character_bytes = character.encode_utf8(&mut character_buffer).as_bytes(); + character_item = character_iterator.next(); + let mut byte_iterator = character_bytes.iter().cloned(); + let mut byte_item = byte_iterator.next(); + let state_before_character = last_state; + 'a: while let Some(byte) = byte_item { + for item in &self.states[last_state as usize] { + if item.character == byte { + match item.next_state { + State::Continue(next_state) => last_state = next_state, + State::Final(_) => return false, + } + byte_item = byte_iterator.next(); + continue 'a; + } + } + byte_item = byte_iterator.next(); + if byte_item.is_none() { + if character_item.is_none() { + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Final(payload), + }); + if case_folded { + self.add_folded_characters( + character, + state_before_character, + State::Final(payload), + ); + } + return true; + } + let next_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(next_state), + }); + self.states.push(vec![]); + if case_folded { + self.add_folded_characters( + character, + state_before_character, + State::Continue(next_state), + ); + } + last_state = next_state; + break; + } + let next_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(next_state), + }); + last_state = next_state; + self.states.push(vec![]); + } + } + false + } + + pub fn find(&self, text: &str) -> Result<(usize, T), usize> { + let mut state = 0; + 'outer: for (position, character1) in text.as_bytes().iter().cloned().enumerate() { + for character2 in &self.states[state as usize] { + if character1 == character2.character { + match character2.next_state { + State::Continue(next_state) => { + state = next_state; + continue 'outer; + } + State::Final(payload) => return Ok((position + 1, payload)), + } + } 
+ } + return Err(position); + } + Err(0) + } + + pub fn new() -> Self { + Trie { + states: vec![vec![]], + } + } +} + +fn simple_fold(character: char) -> Option<&'static [char]> { + match CASE_FOLDING_SIMPLE.binary_search_by_key(&character, |&(character, _)| character) { + Err(_) => None, + Ok(index) => Some(CASE_FOLDING_SIMPLE[index].1), + } +} diff --git a/parse_wiki_text/src/warning.rs b/parse_wiki_text/src/warning.rs new file mode 100644 index 0000000..22e1421 --- /dev/null +++ b/parse_wiki_text/src/warning.rs @@ -0,0 +1,110 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +use std::fmt; + +/// Warning from the parser telling that something is not well-formed. +#[derive(Debug)] +pub struct Warning { + /// The byte position in the wiki text where the warning ends. + pub end: usize, + + /// An identifier for the kind of warning. + pub message: WarningMessage, + + /// The byte position in the wiki text where the warning starts. + pub start: usize, +} + +/// Identifier for a kind of warning from the parser. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum WarningMessage { + /// List broken by definition term. + DefinitionTermContinuation, + + /// End tag in comment. + EndTagInComment, + + /// Invalid character. + InvalidCharacter, + + /// Invalid heading syntax. Rewinding. + InvalidHeadingSyntaxRewinding, + + /// Invalid link syntax. + InvalidLinkSyntax, + + /// Invalid parameter syntax. + InvalidParameterSyntax, + + /// Invalid tag syntax. + InvalidTagSyntax, + + /// Missing end tag. Rewinding. + MissingEndTagRewinding, + + /// Repeated empty line. + RepeatedEmptyLine, + + /// Stray text in table. + StrayTextInTable, + + /// Wiki text comes after a redirect. + TextAfterRedirect, + + /// The end tag does not match the last start tag. Rewinding. 
+ UnexpectedEndTagRewinding, + + /// An end tag was found with no preceeding start tag. + UnexpectedEndTag, + + /// Expected heading of higher level. Correcting start of heading. + UnexpectedHeadingLevelCorrecting, + + /// A tag with an unrecognized tag name was found. + UnrecognizedTagName, + + /// Useless text in parameter. + UselessTextInParameter, + + /// Useless text in redirect. + UselessTextInRedirect, +} + +impl WarningMessage { + /// Human-readable description of the warning. + pub fn message(self) -> &'static str { + match self { + WarningMessage::DefinitionTermContinuation => "List broken by definition term.", + WarningMessage::EndTagInComment => "End tag in comment.", + WarningMessage::InvalidCharacter => "Invalid character.", + WarningMessage::InvalidHeadingSyntaxRewinding => "Invalid heading syntax. Rewinding.", + WarningMessage::InvalidLinkSyntax => "Invalid link syntax.", + WarningMessage::InvalidParameterSyntax => "Invalid parameter syntax.", + WarningMessage::InvalidTagSyntax => "Invalid tag syntax.", + WarningMessage::MissingEndTagRewinding => "Missing end tag. Rewinding.", + WarningMessage::RepeatedEmptyLine => "Repeated empty line.", + WarningMessage::StrayTextInTable => "Stray text in table.", + WarningMessage::TextAfterRedirect => "Wiki text comes after a redirect.", + WarningMessage::UnexpectedEndTagRewinding => { + "The end tag does not match the last start tag. Rewinding." + } + WarningMessage::UnexpectedEndTag => { + "An end tag was found with no preceeding start tag." + } + WarningMessage::UnexpectedHeadingLevelCorrecting => { + "Expected heading of higher level. Correcting start of heading." 
+ } + WarningMessage::UnrecognizedTagName => "A tag with an unrecognized tag name was found.", + WarningMessage::UselessTextInParameter => "Useless text in parameter.", + WarningMessage::UselessTextInRedirect => "Useless text in redirect.", + } + } +} + +impl fmt::Display for WarningMessage { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str(self.message()) + } +} |