diff options
author | Kirawi | 2021-06-08 04:20:15 +0000 |
---|---|---|
committer | GitHub | 2021-06-08 04:20:15 +0000 |
commit | b873fb9897bb5b24a60cca3d9fa69285446a857f (patch) | |
tree | fe13df2f3f57990f5e6ef93b0049457855d4ed76 /helix-core | |
parent | 8f1eb7b2b03fd6907307f4e0065d0c43da22edb3 (diff) |
Fix Unicode (#135)
* init
* wip
* wip
* fix unicode break
* fix unicode break
* Update helix-core/src/transaction.rs
Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
* clippy
* fix
* add changes
* added test
* wip
* wip
* wip
* wip
* fix
* fix view
* fix #88
Co-authored-by: Benoît Cortier <benoit.cortier@fried-world.eu>
Diffstat (limited to 'helix-core')
-rw-r--r-- | helix-core/Cargo.toml | 1 |
-rw-r--r-- | helix-core/src/movement.rs | 85 |
-rw-r--r-- | helix-core/src/transaction.rs | 2 |
-rw-r--r-- | helix-core/src/words.rs | 14 |
4 files changed, 81 insertions, 21 deletions
diff --git a/helix-core/Cargo.toml b/helix-core/Cargo.toml index c544e108..2d7a6ae8 100644 --- a/helix-core/Cargo.toml +++ b/helix-core/Cargo.toml @@ -19,6 +19,7 @@ smallvec = "1.4" tendril = "0.4.2" unicode-segmentation = "1.6" unicode-width = "0.1" +unicode-general-category = "0.4.0" # slab = "0.4.2" tree-sitter = "0.19" once_cell = "1.4" diff --git a/helix-core/src/movement.rs b/helix-core/src/movement.rs index 96bbd54b..9d62b16c 100644 --- a/helix-core/src/movement.rs +++ b/helix-core/src/movement.rs @@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) -> if is_word(ch) { skip_over_next(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + skip_over_next(slice, &mut end, is_punctuation); } - skip_over_next(slice, &mut end, is_horiz_blank); + skip_over_next(slice, &mut end, char::is_whitespace); } Some(Range::new(begin, end - 1)) @@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) -> end = begin; - with_end = skip_over_prev(slice, &mut end, is_horiz_blank); + with_end = skip_over_prev(slice, &mut end, char::is_whitespace); // refetch let ch = slice.char(end); if is_word(ch) { with_end = skip_over_prev(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + with_end = skip_over_prev(slice, &mut end, is_punctuation); } } @@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O end = begin; - skip_over_next(slice, &mut end, is_horiz_blank); + skip_over_next(slice, &mut end, char::is_whitespace); // refetch let ch = slice.char(end); if is_word(ch) { skip_over_next(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); 
+ } else if is_punctuation(ch) { + skip_over_next(slice, &mut end, is_punctuation); } } @@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O // used for by-word movement +#[inline] pub(crate) fn is_word(ch: char) -> bool { ch.is_alphanumeric() || ch == '_' } -pub(crate) fn is_horiz_blank(ch: char) -> bool { - matches!(ch, ' ' | '\t') +#[inline] +pub(crate) fn is_punctuation(ch: char) -> bool { + use unicode_general_category::{get_general_category, GeneralCategory}; + + matches!( + get_general_category(ch), + GeneralCategory::OtherPunctuation + | GeneralCategory::OpenPunctuation + | GeneralCategory::ClosePunctuation + | GeneralCategory::InitialPunctuation + | GeneralCategory::FinalPunctuation + | GeneralCategory::ConnectorPunctuation + | GeneralCategory::DashPunctuation + | GeneralCategory::MathSymbol + | GeneralCategory::CurrencySymbol + | GeneralCategory::ModifierSymbol + ) } #[derive(Debug, Eq, PartialEq)] @@ -191,14 +207,15 @@ pub(crate) enum Category { Unknown, } +#[inline] pub(crate) fn categorize(ch: char) -> Category { if ch == '\n' { Category::Eol - } else if ch.is_ascii_whitespace() { + } else if ch.is_whitespace() { Category::Whitespace } else if is_word(ch) { Category::Word - } else if ch.is_ascii_punctuation() { + } else if is_punctuation(ch) { Category::Punctuation } else { Category::Unknown @@ -213,6 +230,7 @@ where { let mut chars = slice.chars_at(*pos); + #[allow(clippy::while_let_on_iterator)] while let Some(ch) = chars.next() { if !fun(ch) { break; @@ -231,6 +249,7 @@ where // need to +1 so that prev() includes current char let mut chars = slice.chars_at(*pos + 1); + #[allow(clippy::while_let_on_iterator)] while let Some(ch) = chars.prev() { if !fun(ch) { break; @@ -259,4 +278,44 @@ mod test { (1, 2).into() ); } + + #[test] + fn test_categorize() { + const WORD_TEST_CASE: &'static str = + "_hello_world_あいうえおー12345678901234567890"; + const PUNCTUATION_TEST_CASE: &'static str = 
"!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~"; + const WHITESPACE_TEST_CASE: &'static str = " "; + + assert_eq!(Category::Eol, categorize('\n')); + + for ch in WHITESPACE_TEST_CASE.chars() { + assert_eq!( + Category::Whitespace, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Whitespace`", + ch, + categorize(ch) + ); + } + + for ch in WORD_TEST_CASE.chars() { + assert_eq!( + Category::Word, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Word`", + ch, + categorize(ch) + ); + } + + for ch in PUNCTUATION_TEST_CASE.chars() { + assert_eq!( + Category::Punctuation, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Punctuation`", + ch, + categorize(ch) + ); + } + } } diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs index 085f40b7..3a719628 100644 --- a/helix-core/src/transaction.rs +++ b/helix-core/src/transaction.rs @@ -758,7 +758,7 @@ mod test { #[test] fn combine_with_utf8() { - const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです!"; + const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです!"; let empty = Rope::from(""); let mut a = ChangeSet::new(&empty); diff --git a/helix-core/src/words.rs b/helix-core/src/words.rs index 5ff25050..2cbd88d4 100644 --- a/helix-core/src/words.rs +++ b/helix-core/src/words.rs @@ -1,4 +1,4 @@ -use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev}; +use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev}; use ropey::RopeSlice; #[must_use] @@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz // return if not skip while? 
skip_over_prev(slice, &mut char_idx, |ch| ch == '\n'); - with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank); + with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace); // refetch let ch = slice.char(char_idx); if is_word(ch) { with_end = skip_over_prev(slice, &mut char_idx, is_word); - } else if ch.is_ascii_punctuation() { - with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + with_end = skip_over_prev(slice, &mut char_idx, is_punctuation); } } @@ -47,11 +47,11 @@ fn different_prev_word_boundary() { t("hello, world", "hello, "); t("hello, ", "hello"); t("hello", ""); - t("こんにちは、世界!", "こんにちは、世界!"); // TODO: punctuation + t("こんにちは、世界!", "こんにちは、世界"); t("こんにちは、世界", "こんにちは、"); - t("こんにちは、", "こんにちは、"); // what? + t("こんにちは、", "こんにちは"); t("こんにちは", ""); - t("この世界。", "この世界。"); // what? + t("この世界。", "この世界"); t("この世界", ""); t("お前はもう死んでいる", ""); t("その300円です", ""); // TODO: should stop at 300 |