From a2e04ff18ad27be4dc1c66079941baaec79e003f Mon Sep 17 00:00:00 2001 From: JJ Date: Wed, 4 Jan 2023 15:57:41 -0800 Subject: Copy the last version of the parse_wiki_text crate in for development --- parse_wiki_text/src/configuration.rs | 164 +++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 parse_wiki_text/src/configuration.rs (limited to 'parse_wiki_text/src/configuration.rs') diff --git a/parse_wiki_text/src/configuration.rs b/parse_wiki_text/src/configuration.rs new file mode 100644 index 0000000..875a69b --- /dev/null +++ b/parse_wiki_text/src/configuration.rs @@ -0,0 +1,164 @@ +// Copyright 2019 Fredrik Portström +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +/// Site specific configuration of a wiki. +/// +/// This is generated using the program [`fetch_mediawiki_configuration`](https://github.com/portstrom/fetch_mediawiki_configuration). +pub struct ConfigurationSource<'a> { + /// Aliases of the category namespace. + pub category_namespaces: &'a [&'a str], + + /// Tag names of extension tags. + pub extension_tags: &'a [&'a str], + + /// Aliases of the file namespace. + pub file_namespaces: &'a [&'a str], + + /// Characters that can appear in link trails. + pub link_trail: &'a str, + + /// Magic words that can appear between `__` and `__`. + pub magic_words: &'a [&'a str], + + /// Protocols that can be used for external links. + pub protocols: &'a [&'a str], + + /// Magic words that can be used for redirects. + pub redirect_magic_words: &'a [&'a str], +} + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub enum Namespace { + Category, + File, +} + +impl crate::Configuration { + /// Allocates and returns a new configuration based on the given site specific configuration. + #[must_use] + pub fn new(source: &ConfigurationSource) -> Self { + let mut configuration = crate::Configuration { + character_entities: crate::Trie::new(), + link_trail_character_set: crate::HashSet::new(), + magic_words: crate::Trie::new(), + namespaces: crate::Trie::new(), + protocols: crate::Trie::new(), + redirect_magic_words: crate::Trie::new(), + tag_name_map: crate::HashMap::new(), + }; + for (name, character) in crate::html_entities::HTML_ENTITIES { + configuration + .character_entities + .add_case_sensitive_term(&format!("{};", name), *character); + } + for character in source.link_trail.chars() { + configuration.link_trail_character_set.insert(character); + } + for protocol in source.protocols { + configuration.protocols.add_term(protocol, ()); + } + for magic_word in source.magic_words { + configuration.magic_words.add_term(magic_word, ()); + } + for namespace in source.category_namespaces { + configuration + .namespaces + .add_term(&format!("{}:", namespace), Namespace::Category); + } + for namespace in source.file_namespaces { + configuration + .namespaces + .add_term(&format!("{}:", namespace), Namespace::File); + } + for redirect_magic_word in source.redirect_magic_words { + configuration + .redirect_magic_words + .add_term(redirect_magic_word, ()); + } + for tag_name in source.extension_tags { + configuration + .tag_name_map + .insert(tag_name.to_string(), crate::TagClass::ExtensionTag); + } + for tag_name in [ + "abbr", + "b", + "bdi", + "bdo", + "blockquote", + "br", + "caption", + "center", + "cite", + "code", + "data", + "dd", + "del", + "dfn", + "div", + "dl", + "dt", + "em", + "font", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "ins", + "kbd", + "li", + "mark", + "ol", + "p", + "pre", + "q", + "rb", + "rp", + "rt", + "ruby", + "s", + "samp", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "td", + "th", + "time", + "tr", + "tt", + "u", + "ul", + "var", + "wbr", + ] + .iter() + { + configuration + .tag_name_map + .insert(tag_name.to_string(), crate::TagClass::Tag); + } + configuration + } + + /// Parses wiki text into structured data. + #[must_use] + pub fn parse<'a>(&self, wiki_text: &'a str) -> crate::Output<'a> { + crate::parse::parse(self, wiki_text) + } +} + +impl Default for crate::Configuration { + /// Allocates and returns a configuration suitable for testing and quick and dirty prototyping. For correctly parsing an actual wiki, please get the correct site configuration for that particular wiki. + fn default() -> Self { + crate::default::create_configuration() + } +} -- cgit v1.2.3-70-g09d2