summary | refs | log | tree | commit | diff
path: root/helix-core/src/chars.rs
diff options
context:
space:
mode:
Diffstat (limited to 'helix-core/src/chars.rs')
-rw-r--r-- helix-core/src/chars.rs 122
1 files changed, 107 insertions, 15 deletions
diff --git a/helix-core/src/chars.rs b/helix-core/src/chars.rs
index 243a1374..24133dd3 100644
--- a/helix-core/src/chars.rs
+++ b/helix-core/src/chars.rs
@@ -1,25 +1,44 @@
-/// Determine whether a character is a line break.
-pub fn char_is_linebreak(c: char) -> bool {
- matches!(
- c,
- '\u{000A}' | // LineFeed
- '\u{000B}' | // VerticalTab
- '\u{000C}' | // FormFeed
- '\u{000D}' | // CarriageReturn
- '\u{0085}' | // NextLine
- '\u{2028}' | // Line Separator
- '\u{2029}' // ParagraphSeparator
- )
+use crate::LineEnding;
+
+/// Coarse classification of a single character, as produced by
+/// [`categorize_char`].
+///
+/// The enum has no fields, so it is cheap to copy, compare and hash.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub enum CharCategory {
+    /// Whitespace that is not a line ending.
+    Whitespace,
+    /// A line-ending character (see [`char_is_line_ending`]).
+    Eol,
+    /// A word character (see [`char_is_word`]).
+    Word,
+    /// A punctuation or symbol character (see [`char_is_punctuation`]).
+    Punctuation,
+    /// Anything that matches none of the other categories.
+    Unknown,
+}
+
+/// Classify `ch` into a [`CharCategory`].
+///
+/// The checks run in a fixed order and the first match wins: line ending,
+/// whitespace, word character, punctuation. Line endings are tested first so
+/// that characters such as `'\n'` are reported as [`CharCategory::Eol`]
+/// before the whitespace check can claim them. Whitespace is decided by the
+/// standard library's `char::is_whitespace`, not by `char_is_whitespace`
+/// from this module. A character that passes none of the checks is
+/// [`CharCategory::Unknown`].
+#[inline]
+pub fn categorize_char(ch: char) -> CharCategory {
+    if char_is_line_ending(ch) {
+        return CharCategory::Eol;
+    }
+    if ch.is_whitespace() {
+        return CharCategory::Whitespace;
+    }
+    if char_is_word(ch) {
+        return CharCategory::Word;
+    }
+    if char_is_punctuation(ch) {
+        return CharCategory::Punctuation;
+    }
+    CharCategory::Unknown
+}
+
+/// Determine whether a character is a line ending.
+///
+/// A character counts as a line ending exactly when
+/// `LineEnding::from_char` returns `Some` for it, so the set of recognised
+/// characters is defined by `LineEnding` rather than by this function.
+#[inline]
+pub fn char_is_line_ending(ch: char) -> bool {
+ LineEnding::from_char(ch).is_some()
}
/// Determine whether a character qualifies as (non-line-break)
/// whitespace.
-pub fn char_is_whitespace(c: char) -> bool {
+#[inline]
+pub fn char_is_whitespace(ch: char) -> bool {
// TODO: this is a naive binary categorization of whitespace
// characters. For display, word wrapping, etc. we'll need a better
// categorization based on e.g. breaking vs non-breaking spaces
// and whether they're zero-width or not.
- match c {
+ match ch {
//'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
'\u{0009}' | // Character Tabulation
'\u{0020}' | // Space
@@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool {
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
// Four-per-em Space, Six-per-em Space, Figure Space,
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
- c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true,
+ ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
_ => false,
}
}
+
+/// Determine whether a character counts as punctuation.
+///
+/// The decision is based on the character's Unicode general category. All
+/// seven punctuation categories are accepted, and so are the math, currency
+/// and modifier symbol categories, which makes characters such as `+`, `$`
+/// and `^` punctuation for the purposes of this function. The "other symbol"
+/// category is not accepted.
+#[inline]
+pub fn char_is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory as Category};
+
+    let category = get_general_category(ch);
+    matches!(
+        category,
+        // Punctuation proper.
+        Category::ConnectorPunctuation
+            | Category::DashPunctuation
+            | Category::OpenPunctuation
+            | Category::ClosePunctuation
+            | Category::InitialPunctuation
+            | Category::FinalPunctuation
+            | Category::OtherPunctuation
+            // Symbol categories that are treated like punctuation.
+            | Category::MathSymbol
+            | Category::CurrencySymbol
+            | Category::ModifierSymbol
+    )
+}
+
+/// Determine whether a character is a word character: an underscore, or
+/// anything `char::is_alphanumeric` accepts.
+#[inline]
+pub fn char_is_word(ch: char) -> bool {
+    ch == '_' || ch.is_alphanumeric()
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Every character of each sample string must land in the category the
+    /// sample is named after.
+    #[test]
+    fn test_categorize() {
+        const EOL_TEST_CASE: &str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
+        const WORD_TEST_CASE: &str = "_hello_world_あいうえおー12345678901234567890";
+        const PUNCTUATION_TEST_CASE: &str =
+            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
+        const WHITESPACE_TEST_CASE: &str = "      ";
+
+        // One table instead of four copy-pasted loops, so every category
+        // (including Eol) gets the same diagnostic on failure.
+        let cases = [
+            (CharCategory::Eol, EOL_TEST_CASE),
+            (CharCategory::Whitespace, WHITESPACE_TEST_CASE),
+            (CharCategory::Word, WORD_TEST_CASE),
+            (CharCategory::Punctuation, PUNCTUATION_TEST_CASE),
+        ];
+
+        for (expected, sample) in &cases {
+            for ch in sample.chars() {
+                let actual = categorize_char(ch);
+                // `{:?}` on the char keeps control characters readable.
+                assert_eq!(
+                    *expected, actual,
+                    "Testing {:?}, but got `CharCategory::{:?}` instead of `CharCategory::{:?}`",
+                    ch, actual, expected
+                );
+            }
+        }
+    }
+}