diff options
Diffstat (limited to 'helix-core/src/chars.rs')
-rw-r--r-- | helix-core/src/chars.rs | 122 |
1 files changed, 107 insertions, 15 deletions
diff --git a/helix-core/src/chars.rs b/helix-core/src/chars.rs
index 243a1374..24133dd3 100644
--- a/helix-core/src/chars.rs
+++ b/helix-core/src/chars.rs
@@ -1,25 +1,44 @@
-/// Determine whether a character is a line break.
-pub fn char_is_linebreak(c: char) -> bool {
-    matches!(
-        c,
-        '\u{000A}' | // LineFeed
-        '\u{000B}' | // VerticalTab
-        '\u{000C}' | // FormFeed
-        '\u{000D}' | // CarriageReturn
-        '\u{0085}' | // NextLine
-        '\u{2028}' | // Line Separator
-        '\u{2029}' // ParagraphSeparator
-    )
+use crate::LineEnding;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum CharCategory {
+    Whitespace,
+    Eol,
+    Word,
+    Punctuation,
+    Unknown,
+}
+
+#[inline]
+pub fn categorize_char(ch: char) -> CharCategory {
+    if char_is_line_ending(ch) {
+        CharCategory::Eol
+    } else if ch.is_whitespace() {
+        CharCategory::Whitespace
+    } else if char_is_word(ch) {
+        CharCategory::Word
+    } else if char_is_punctuation(ch) {
+        CharCategory::Punctuation
+    } else {
+        CharCategory::Unknown
+    }
+}
+
+/// Determine whether a character is a line ending.
+#[inline]
+pub fn char_is_line_ending(ch: char) -> bool {
+    LineEnding::from_char(ch).is_some()
 }
 
 /// Determine whether a character qualifies as (non-line-break)
 /// whitespace.
-pub fn char_is_whitespace(c: char) -> bool {
+#[inline]
+pub fn char_is_whitespace(ch: char) -> bool {
     // TODO: this is a naive binary categorization of whitespace
     // characters. For display, word wrapping, etc. we'll need a better
     // categorization based on e.g. breaking vs non-breaking spaces
     // and whether they're zero-width or not.
-    match c {
+    match ch {
         //'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
         '\u{0009}' | // Character Tabulation
         '\u{0020}' | // Space
@@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool {
         // En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
         // Four-per-em Space, Six-per-em Space, Figure Space,
         // Punctuation Space, Thin Space, Hair Space, Zero Width Space.
-        c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true,
+        ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
 
         _ => false,
     }
 }
+
+#[inline]
+pub fn char_is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory};
+
+    matches!(
+        get_general_category(ch),
+        GeneralCategory::OtherPunctuation
+            | GeneralCategory::OpenPunctuation
+            | GeneralCategory::ClosePunctuation
+            | GeneralCategory::InitialPunctuation
+            | GeneralCategory::FinalPunctuation
+            | GeneralCategory::ConnectorPunctuation
+            | GeneralCategory::DashPunctuation
+            | GeneralCategory::MathSymbol
+            | GeneralCategory::CurrencySymbol
+            | GeneralCategory::ModifierSymbol
+    )
+}
+
+#[inline]
+pub fn char_is_word(ch: char) -> bool {
+    ch.is_alphanumeric() || ch == '_'
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_categorize() {
+        const EOL_TEST_CASE: &'static str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
+        const WORD_TEST_CASE: &'static str =
+            "_hello_world_あいうえおー12345678901234567890";
+        const PUNCTUATION_TEST_CASE: &'static str =
+            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
+        const WHITESPACE_TEST_CASE: &'static str = " ";
+
+        for ch in EOL_TEST_CASE.chars() {
+            assert_eq!(CharCategory::Eol, categorize_char(ch));
+        }
+
+        for ch in WHITESPACE_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Whitespace,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+
+        for ch in WORD_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Word,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Word`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+
+        for ch in PUNCTUATION_TEST_CASE.chars() {
+            assert_eq!(
+                CharCategory::Punctuation,
+                categorize_char(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
+                ch,
+                categorize_char(ch)
+            );
+        }
+    }
+}