From b873fb9897bb5b24a60cca3d9fa69285446a857f Mon Sep 17 00:00:00 2001 From: Kirawi Date: Tue, 8 Jun 2021 00:20:15 -0400 Subject: Fix Unicode (#135) * init * wip * wip * fix unicode break * fix unicode break * Update helix-core/src/transaction.rs Co-authored-by: Benoît Cortier * clippy * fix * add changes * added test * wip * wip * wip * wip * fix * fix view * fix #88 Co-authored-by: Benoît Cortier --- Cargo.lock | 7 ++++ helix-core/Cargo.toml | 1 + helix-core/src/movement.rs | 85 ++++++++++++++++++++++++++++++++++++------- helix-core/src/transaction.rs | 2 +- helix-core/src/words.rs | 14 +++---- helix-term/src/commands.rs | 9 +++-- helix-view/src/view.rs | 2 +- 7 files changed, 94 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e1db781..6d51546a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -265,6 +265,7 @@ dependencies = [ "tendril", "toml", "tree-sitter", + "unicode-general-category", "unicode-segmentation", "unicode-width", ] @@ -969,6 +970,12 @@ dependencies = [ "matches", ] +[[package]] +name = "unicode-general-category" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07547e3ee45e28326cc23faac56d44f58f16ab23e413db526debce3b0bfd2742" + [[package]] name = "unicode-normalization" version = "0.1.19" diff --git a/helix-core/Cargo.toml b/helix-core/Cargo.toml index c544e108..2d7a6ae8 100644 --- a/helix-core/Cargo.toml +++ b/helix-core/Cargo.toml @@ -19,6 +19,7 @@ smallvec = "1.4" tendril = "0.4.2" unicode-segmentation = "1.6" unicode-width = "0.1" +unicode-general-category = "0.4.0" # slab = "0.4.2" tree-sitter = "0.19" once_cell = "1.4" diff --git a/helix-core/src/movement.rs b/helix-core/src/movement.rs index 96bbd54b..9d62b16c 100644 --- a/helix-core/src/movement.rs +++ b/helix-core/src/movement.rs @@ -88,11 +88,11 @@ pub fn move_next_word_start(slice: RopeSlice, mut begin: usize, count: usize) -> if is_word(ch) { skip_over_next(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + skip_over_next(slice, &mut end, is_punctuation); } - skip_over_next(slice, &mut end, is_horiz_blank); + skip_over_next(slice, &mut end, char::is_whitespace); } Some(Range::new(begin, end - 1)) @@ -119,15 +119,15 @@ pub fn move_prev_word_start(slice: RopeSlice, mut begin: usize, count: usize) -> end = begin; - with_end = skip_over_prev(slice, &mut end, is_horiz_blank); + with_end = skip_over_prev(slice, &mut end, char::is_whitespace); // refetch let ch = slice.char(end); if is_word(ch) { with_end = skip_over_prev(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - with_end = skip_over_prev(slice, &mut end, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + with_end = skip_over_prev(slice, &mut end, is_punctuation); } } @@ -155,15 +155,15 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O end = begin; - skip_over_next(slice, &mut end, is_horiz_blank); + skip_over_next(slice, &mut end, char::is_whitespace); // refetch let ch = slice.char(end); if is_word(ch) { skip_over_next(slice, &mut end, is_word); - } else if ch.is_ascii_punctuation() { - skip_over_next(slice, &mut end, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + skip_over_next(slice, &mut end, is_punctuation); } } @@ -174,12 +174,28 @@ pub fn move_next_word_end(slice: RopeSlice, mut begin: usize, count: usize) -> O // used for by-word movement +#[inline] pub(crate) fn is_word(ch: char) -> bool { ch.is_alphanumeric() || ch == '_' } -pub(crate) fn is_horiz_blank(ch: char) -> bool { - matches!(ch, ' ' | '\t') +#[inline] +pub(crate) fn is_punctuation(ch: char) -> bool { + use unicode_general_category::{get_general_category, GeneralCategory}; + + matches!( + get_general_category(ch), + GeneralCategory::OtherPunctuation + | GeneralCategory::OpenPunctuation + | GeneralCategory::ClosePunctuation + | GeneralCategory::InitialPunctuation + | GeneralCategory::FinalPunctuation + | GeneralCategory::ConnectorPunctuation + | GeneralCategory::DashPunctuation + | GeneralCategory::MathSymbol + | GeneralCategory::CurrencySymbol + | GeneralCategory::ModifierSymbol + ) } #[derive(Debug, Eq, PartialEq)] @@ -191,14 +207,15 @@ pub(crate) enum Category { Unknown, } +#[inline] pub(crate) fn categorize(ch: char) -> Category { if ch == '\n' { Category::Eol - } else if ch.is_ascii_whitespace() { + } else if ch.is_whitespace() { Category::Whitespace } else if is_word(ch) { Category::Word - } else if ch.is_ascii_punctuation() { + } else if is_punctuation(ch) { Category::Punctuation } else { Category::Unknown @@ -213,6 +230,7 @@ where { let mut chars = slice.chars_at(*pos); + #[allow(clippy::while_let_on_iterator)] while let Some(ch) = chars.next() { if !fun(ch) { break; @@ -231,6 +249,7 @@ where // need to +1 so that prev() includes current char let mut chars = slice.chars_at(*pos + 1); + #[allow(clippy::while_let_on_iterator)] while let Some(ch) = chars.prev() { if !fun(ch) { break; @@ -259,4 +278,44 @@ mod test { (1, 2).into() ); } + + #[test] + fn test_categorize() { + const WORD_TEST_CASE: &'static str = + "_hello_world_あいうえおー12345678901234567890"; + const PUNCTUATION_TEST_CASE: &'static str = "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~"; + const WHITESPACE_TEST_CASE: &'static str = "      "; + + assert_eq!(Category::Eol, categorize('\n')); + + for ch in WHITESPACE_TEST_CASE.chars() { + assert_eq!( + Category::Whitespace, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Whitespace`", + ch, + categorize(ch) + ); + } + + for ch in WORD_TEST_CASE.chars() { + assert_eq!( + Category::Word, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Word`", + ch, + categorize(ch) + ); + } + + for ch in PUNCTUATION_TEST_CASE.chars() { + assert_eq!( + Category::Punctuation, + categorize(ch), + "Testing '{}', but got `{:?}` instead of `Category::Punctuation`", + ch, + categorize(ch) + ); + } + } } diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs index 085f40b7..3a719628 100644 --- a/helix-core/src/transaction.rs +++ b/helix-core/src/transaction.rs @@ -758,7 +758,7 @@ mod test { #[test] fn combine_with_utf8() { - const TEST_CASE: &'static str = "Hello, これはヒレクスエディターです!"; + const TEST_CASE: &'static str = "Hello, これはヘリックスエディターです!"; let empty = Rope::from(""); let mut a = ChangeSet::new(&empty); diff --git a/helix-core/src/words.rs b/helix-core/src/words.rs index 5ff25050..2cbd88d4 100644 --- a/helix-core/src/words.rs +++ b/helix-core/src/words.rs @@ -1,4 +1,4 @@ -use crate::movement::{categorize, is_horiz_blank, is_word, skip_over_prev}; +use crate::movement::{categorize, is_punctuation, is_word, skip_over_prev}; use ropey::RopeSlice; #[must_use] @@ -13,15 +13,15 @@ pub fn nth_prev_word_boundary(slice: RopeSlice, mut char_idx: usize, count: usiz // return if not skip while? skip_over_prev(slice, &mut char_idx, |ch| ch == '\n'); - with_end = skip_over_prev(slice, &mut char_idx, is_horiz_blank); + with_end = skip_over_prev(slice, &mut char_idx, char::is_whitespace); // refetch let ch = slice.char(char_idx); if is_word(ch) { with_end = skip_over_prev(slice, &mut char_idx, is_word); - } else if ch.is_ascii_punctuation() { - with_end = skip_over_prev(slice, &mut char_idx, |ch| ch.is_ascii_punctuation()); + } else if is_punctuation(ch) { + with_end = skip_over_prev(slice, &mut char_idx, is_punctuation); } } @@ -47,11 +47,11 @@ fn different_prev_word_boundary() { t("hello, world", "hello, "); t("hello, ", "hello"); t("hello", ""); - t("こんにちは、世界!", "こんにちは、世界!"); // TODO: punctuation + t("こんにちは、世界!", "こんにちは、世界"); t("こんにちは、世界", "こんにちは、"); - t("こんにちは、", "こんにちは、"); // what? + t("こんにちは、", "こんにちは"); t("こんにちは", ""); - t("この世界。", "この世界。"); // what? + t("この世界。", "この世界"); t("この世界", ""); t("お前はもう死んでいる", ""); t("その300円です", ""); // TODO: should stop at 300 diff --git a/helix-term/src/commands.rs b/helix-term/src/commands.rs index 0da23fc7..2412e55d 100644 --- a/helix-term/src/commands.rs +++ b/helix-term/src/commands.rs @@ -654,9 +654,10 @@ pub fn split_selection_on_newline(cx: &mut Context) { fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, extend: bool) { let text = doc.text(); let selection = doc.selection(view.id); - let start = selection.cursor(); + let start = text.char_to_byte(selection.cursor()); // use find_at to find the next match after the cursor, loop around the end + // Careful, `Regex` uses `bytes` as offsets, not character indices! let mat = regex .find_at(contents, start) .or_else(|| regex.find(contents)); @@ -670,7 +671,7 @@ fn _search(doc: &mut Document, view: &mut View, contents: &str, regex: &Regex, e return; } - let head = end; + let head = end - 1; let selection = if extend { selection.clone().push(Range::new(start, head)) @@ -1027,7 +1028,7 @@ pub fn command_mode(cx: &mut Context) { let mut prompt = Prompt::new( ":".to_owned(), |input: &str| { - // we use .this over split_ascii_whitespace() because we care about empty segments + // we use .this over split_whitespace() because we care about empty segments let parts = input.split(' ').collect::>(); // simple heuristic: if there's no just one part, complete command name. @@ -1069,7 +1070,7 @@ pub fn command_mode(cx: &mut Context) { return; } - let parts = input.split_ascii_whitespace().collect::>(); + let parts = input.split_whitespace().collect::>(); if parts.is_empty() { return; } diff --git a/helix-view/src/view.rs b/helix-view/src/view.rs index 8eccb9ef..b7bfaa17 100644 --- a/helix-view/src/view.rs +++ b/helix-view/src/view.rs @@ -106,7 +106,7 @@ impl View { /// Calculates the last visible line on screen #[inline] pub fn last_line(&self, doc: &Document) -> usize { - let height = self.area.height.saturating_sub(2); // - 2 for statusline + let height = self.area.height.saturating_sub(1); // - 1 for statusline std::cmp::min( self.first_line + height as usize, doc.text().len_lines() - 1, -- cgit v1.2.3-70-g09d2