Fix Unicode (#135)

* init
* wip
* wip
* fix unicode break
* fix unicode break
* Update helix-core/src/transaction.rs

  Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
* clippy
* fix
* add changes
* added test
* wip
* wip
* wip
* wip
* fix
* fix view
* fix #88

Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
author: Kirawi 2021-06-08 04:20:15 +0000
committer: GitHub 2021-06-08 04:20:15 +0000
commit: b873fb9897bb5b24a60cca3d9fa69285446a857f (patch)
tree: fe13df2f3f57990f5e6ef93b0049457855d4ed76 /helix-core/src
parent: 8f1eb7b2b03fd6907307f4e0065d0c43da22edb3 (diff)
3 files changed, 80 insertions, 21 deletions
diff --git a/helix-core/src/movement.rs b/helix-core/src/movement.rs
index 96bbd54b..9d62b16c 100644
--- a/helix-core/src/movement.rs
+++ b/helix-core/src/movement.rs
@@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
 
         if is_word(ch) {
             skip_over_next(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            skip_over_next(slice, &mut end, is_punctuation);
         }
 
-        skip_over_next(slice, &mut end, is_horiz_blank);
+        skip_over_next(slice, &mut end, char::is_whitespace);
     }
 
     Some(Range::new(begin, end - 1))
@@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) ->
 
         end = begin;
 
-        with_end = skip_over_prev(slice, &mut end, is_horiz_blank);
+        with_end = skip_over_prev(slice, &mut end, char::is_whitespace);
 
         // refetch
         let ch = slice.char(end);
 
         if is_word(ch) {
             with_end = skip_over_prev(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            with_end = skip_over_prev(slice, &mut end, is_punctuation);
         }
     }
 
@@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
 
         end = begin;
 
-        skip_over_next(slice, &mut end, is_horiz_blank);
+        skip_over_next(slice, &mut end, char::is_whitespace);
 
         // refetch
         let ch = slice.char(end);
 
         if is_word(ch) {
             skip_over_next(slice, &mut end, is_word);
-        } else if ch.is_ascii_punctuation() {
-            skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            skip_over_next(slice, &mut end, is_punctuation);
         }
     }
 
@@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O
 
 // used for by-word movement
 
+#[inline]
 pub(crate) fn is_word(ch: char) -> bool {
     ch.is_alphanumeric() || ch == '_'
 }
 
-pub(crate) fn is_horiz_blank(ch: char) -> bool {
-    matches!(ch, ' ' | '\t')
+#[inline]
+pub(crate) fn is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory};
+
+    matches!(
+        get_general_category(ch),
+        GeneralCategory::OtherPunctuation
+            | GeneralCategory::OpenPunctuation
+            | GeneralCategory::ClosePunctuation
+            | GeneralCategory::InitialPunctuation
+            | GeneralCategory::FinalPunctuation
+            | GeneralCategory::ConnectorPunctuation
+            | GeneralCategory::DashPunctuation
+            | GeneralCategory::MathSymbol
+            | GeneralCategory::CurrencySymbol
+            | GeneralCategory::ModifierSymbol
+    )
 }
 
 #[derive(Debug, Eq, PartialEq)]
@@ -191,14 +207,15 @@ pub(crate) enum Category {
     Unknown,
 }
 
+#[inline]
 pub(crate) fn categorize(ch: char) -> Category {
     if ch == '\n' {
         Category::Eol
-    } else if ch.is_ascii_whitespace() {
+    } else if ch.is_whitespace() {
         Category::Whitespace
     } else if is_word(ch) {
         Category::Word
-    } else if ch.is_ascii_punctuation() {
+    } else if is_punctuation(ch) {
         Category::Punctuation
     } else {
         Category::Unknown
@@ -213,6 +230,7 @@ where
 {
     let mut chars = slice.chars_at(*pos);
 
+    #[allow(clippy::while_let_on_iterator)]
     while let Some(ch) = chars.next() {
         if !fun(ch) {
             break;
@@ -231,6 +249,7 @@ where
     // need to +1 so that prev() includes current char
     let mut chars = slice.chars_at(*pos + 1);
 
+    #[allow(clippy::while_let_on_iterator)]
     while let Some(ch) = chars.prev() {
         if !fun(ch) {
             break;
@@ -259,4 +278,44 @@ mod test {
             (1, 2).into()
         );
     }
+
+    #[test]
+    fn test_categorize() {
+        const WORD_TEST_CASE: &'static str =
+            "_hello_world_あいうえおー1234567890１２３４５６７８９０";
+        const PUNCTUATION_TEST_CASE: &'static str = "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~！”＃＄％＆’（）＊＋、。：；＜＝＞？＠「」＾｀｛｜｝～";
+        const WHITESPACE_TEST_CASE: &'static str = "  　   ";
+
+        assert_eq!(Category::Eol, categorize('\n'));
+
+        for ch in WHITESPACE_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Whitespace,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Whitespace`",
+                ch,
+                categorize(ch)
+            );
+        }
+
+        for ch in WORD_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Word,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Word`",
+                ch,
+                categorize(ch)
+            );
+        }
+
+        for ch in PUNCTUATION_TEST_CASE.chars() {
+            assert_eq!(
+                Category::Punctuation,
+                categorize(ch),
+                "Testing '{}', but got `{:?}` instead of `Category::Punctuation`",
+                ch,
+                categorize(ch)
+            );
+        }
+    }
 }
diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs
index 085f40b7..3a719628 100644
--- a/helix-core/src/transaction.rs
+++ b/helix-core/src/transaction.rs
@@ -758,7 +758,7 @@ mod test {
 
     #[test]
     fn combine_with_utf8() {
-        const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです！";
+        const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです！";
 
         let empty = Rope::from("");
         let mut a = ChangeSet::new(&empty);
diff --git a/helix-core/src/words.rs b/helix-core/src/words.rs
index 5ff25050..2cbd88d4 100644
--- a/helix-core/src/words.rs
+++ b/helix-core/src/words.rs
@@ -1,4 +1,4 @@
-use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev};
+use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev};
 use ropey::RopeSlice;
 
 #[must_use]
@@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz
         // return if not skip while?
         skip_over_prev(slice, &mut char_idx, |ch| ch == '\n');
 
-        with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank);
+        with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace);
 
         // refetch
         let ch = slice.char(char_idx);
 
         if is_word(ch) {
             with_end = skip_over_prev(slice, &mut char_idx, is_word);
-        } else if ch.is_ascii_punctuation() {
-            with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation());
+        } else if is_punctuation(ch) {
+            with_end = skip_over_prev(slice, &mut char_idx, is_punctuation);
         }
     }
 
@@ -47,11 +47,11 @@ fn different_prev_word_boundary() {
     t("hello, world", "hello, ");
     t("hello, ", "hello");
     t("hello", "");
-    t("こんにちは、世界！", "こんにちは、世界！"); // TODO: punctuation
+    t("こんにちは、世界！", "こんにちは、世界");
     t("こんにちは、世界", "こんにちは、");
-    t("こんにちは、", "こんにちは、"); // what?
+    t("こんにちは、", "こんにちは");
     t("こんにちは", "");
-    t("この世界。", "この世界。"); // what?
+    t("この世界。", "この世界");
     t("この世界", "");
     t("お前はもう死んでいる", "");
     t("その300円です", ""); // TODO: should stop at 300
author	Kirawi	2021-06-08 04:20:15 +0000
committer	GitHub	2021-06-08 04:20:15 +0000
commit	b873fb9897bb5b24a60cca3d9fa69285446a857f (patch)
tree	fe13df2f3f57990f5e6ef93b0049457855d4ed76 /helix-core/src
parent	8f1eb7b2b03fd6907307f4e0065d0c43da22edb3 (diff)