diff options
Diffstat (limited to 'parse_wiki_text/src/trie.rs')
-rw-r--r-- | parse_wiki_text/src/trie.rs | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/parse_wiki_text/src/trie.rs b/parse_wiki_text/src/trie.rs new file mode 100644 index 0000000..f5e9dd2 --- /dev/null +++ b/parse_wiki_text/src/trie.rs @@ -0,0 +1,167 @@ +// Copyright 2019 Fredrik Portström <https://portstrom.com> +// This is free software distributed under the terms specified in +// the file LICENSE at the top-level directory of this distribution. + +use crate::case_folding_simple::CASE_FOLDING_SIMPLE; + +struct Character<T> { + character: u8, + next_state: State<T>, +} + +#[derive(Clone, Copy)] +enum State<T> { + Continue(u32), + Final(T), +} + +pub struct Trie<T> { + states: Vec<Vec<Character<T>>>, +} + +impl<T: Copy> Trie<T> { + pub fn add_case_sensitive_term(&mut self, term: &str, payload: T) -> bool { + self.add_term_internal(term, payload, false) + } + + fn add_folded_characters(&mut self, character: char, initial_state: u32, next_state: State<T>) { + if let Some(folded_characters) = simple_fold(character) { + for character in folded_characters { + let mut last_state = initial_state; + let mut character_buffer = [0; 4]; + let character_bytes = character.encode_utf8(&mut character_buffer).as_bytes(); + let mut byte_iterator = character_bytes.iter().cloned(); + let mut byte_item = byte_iterator.next(); + 'b: while let Some(byte) = byte_item { + for item in &self.states[last_state as usize] { + if item.character == byte { + match item.next_state { + State::Continue(next_state) => last_state = next_state, + State::Final(_) => unreachable!(), + } + byte_item = byte_iterator.next(); + continue 'b; + } + } + byte_item = byte_iterator.next(); + if byte_item.is_none() { + self.states[last_state as usize].push(Character { + character: byte, + next_state, + }); + break; + } + let intermediate_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(intermediate_state), + }); + last_state = intermediate_state; + self.states.push(vec![]); + } + } + } + } + + pub fn add_term(&mut self, term: &str, payload: T) -> bool { + self.add_term_internal(term, payload, true) + } + + fn add_term_internal(&mut self, term: &str, payload: T, case_folded: bool) -> bool { + let mut last_state = 0; + let mut character_iterator = term.chars(); + let mut character_item = character_iterator.next(); + while let Some(character) = character_item { + let mut character_buffer = [0; 4]; + let character_bytes = character.encode_utf8(&mut character_buffer).as_bytes(); + character_item = character_iterator.next(); + let mut byte_iterator = character_bytes.iter().cloned(); + let mut byte_item = byte_iterator.next(); + let state_before_character = last_state; + 'a: while let Some(byte) = byte_item { + for item in &self.states[last_state as usize] { + if item.character == byte { + match item.next_state { + State::Continue(next_state) => last_state = next_state, + State::Final(_) => return false, + } + byte_item = byte_iterator.next(); + continue 'a; + } + } + byte_item = byte_iterator.next(); + if byte_item.is_none() { + if character_item.is_none() { + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Final(payload), + }); + if case_folded { + self.add_folded_characters( + character, + state_before_character, + State::Final(payload), + ); + } + return true; + } + let next_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(next_state), + }); + self.states.push(vec![]); + if case_folded { + self.add_folded_characters( + character, + state_before_character, + State::Continue(next_state), + ); + } + last_state = next_state; + break; + } + let next_state = self.states.len() as _; + self.states[last_state as usize].push(Character { + character: byte, + next_state: State::Continue(next_state), + }); + last_state = next_state; + self.states.push(vec![]); + } + } + false + } + + pub fn find(&self, text: &str) -> Result<(usize, T), usize> { + let mut state = 0; + 'outer: for (position, character1) in text.as_bytes().iter().cloned().enumerate() { + for character2 in &self.states[state as usize] { + if character1 == character2.character { + match character2.next_state { + State::Continue(next_state) => { + state = next_state; + continue 'outer; + } + State::Final(payload) => return Ok((position + 1, payload)), + } + } + } + return Err(position); + } + Err(0) + } + + pub fn new() -> Self { + Trie { + states: vec![vec![]], + } + } +} + +fn simple_fold(character: char) -> Option<&'static [char]> { + match CASE_FOLDING_SIMPLE.binary_search_by_key(&character, |&(character, _)| character) { + Err(_) => None, + Ok(index) => Some(CASE_FOLDING_SIMPLE[index].1), + } +} |