diff options
author | Blaž Hrastnik | 2021-06-22 02:09:19 +0000 |
---|---|---|
committer | GitHub | 2021-06-22 02:09:19 +0000 |
commit | a70de6e980ec58cabf58c33e8b91bfafbea312eb (patch) | |
tree | 476c07b84ee3f399eb55c8b549641a59eedc4e1c /helix-core/src | |
parent | c704970fd71a1a29ef8397ff2ab9e12c5b780a81 (diff) | |
parent | f2954fa153ccb6b147d8d38020341a2f1b0b6df2 (diff) |
Merge pull request #224 from helix-editor/line_ending_detection
Line ending detection
Diffstat (limited to 'helix-core/src')
-rw-r--r-- | helix-core/src/auto_pairs.rs | 2 | ||||
-rw-r--r-- | helix-core/src/chars.rs | 122 | ||||
-rw-r--r-- | helix-core/src/lib.rs | 3 | ||||
-rw-r--r-- | helix-core/src/line_ending.rs | 252 | ||||
-rw-r--r-- | helix-core/src/movement.rs | 166 | ||||
-rw-r--r-- | helix-core/src/position.rs | 6 | ||||
-rw-r--r-- | helix-core/src/syntax.rs | 9 |
7 files changed, 410 insertions, 150 deletions
diff --git a/helix-core/src/auto_pairs.rs b/helix-core/src/auto_pairs.rs index 74e25ac9..746f201a 100644 --- a/helix-core/src/auto_pairs.rs +++ b/helix-core/src/auto_pairs.rs @@ -12,7 +12,7 @@ pub const PAIRS: &[(char, char)] = &[ ('`', '`'), ]; -const CLOSE_BEFORE: &str = ")]}'\":;> \n"; // includes space and newline +const CLOSE_BEFORE: &str = ")]}'\":;> \n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}"; // includes space and newlines // insert hook: // Fn(doc, selection, char) => Option<Transaction> diff --git a/helix-core/src/chars.rs b/helix-core/src/chars.rs index 243a1374..24133dd3 100644 --- a/helix-core/src/chars.rs +++ b/helix-core/src/chars.rs @@ -1,25 +1,44 @@ -/// Determine whether a character is a line break. -pub fn char_is_linebreak(c: char) -> bool { - matches!( - c, - '\u{000A}' | // LineFeed - '\u{000B}' | // VerticalTab - '\u{000C}' | // FormFeed - '\u{000D}' | // CarriageReturn - '\u{0085}' | // NextLine - '\u{2028}' | // Line Separator - '\u{2029}' // ParagraphSeparator - ) +use crate::LineEnding; + +#[derive(Debug, Eq, PartialEq)] +pub enum CharCategory { + Whitespace, + Eol, + Word, + Punctuation, + Unknown, +} + +#[inline] +pub fn categorize_char(ch: char) -> CharCategory { + if char_is_line_ending(ch) { + CharCategory::Eol + } else if ch.is_whitespace() { + CharCategory::Whitespace + } else if char_is_word(ch) { + CharCategory::Word + } else if char_is_punctuation(ch) { + CharCategory::Punctuation + } else { + CharCategory::Unknown + } +} + +/// Determine whether a character is a line ending. +#[inline] +pub fn char_is_line_ending(ch: char) -> bool { + LineEnding::from_char(ch).is_some() } /// Determine whether a character qualifies as (non-line-break) /// whitespace. -pub fn char_is_whitespace(c: char) -> bool { +#[inline] +pub fn char_is_whitespace(ch: char) -> bool { // TODO: this is a naive binary categorization of whitespace // characters. For display, word wrapping, etc. we'll need a better // categorization based on e.g. breaking vs non-breaking spaces // and whether they're zero-width or not. - match c { + match ch { //'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace) '\u{0009}' | // Character Tabulation '\u{0020}' | // Space @@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool { // En Quad, Em Quad, En Space, Em Space, Three-per-em Space, // Four-per-em Space, Six-per-em Space, Figure Space, // Punctuation Space, Thin Space, Hair Space, Zero Width Space. - c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true, + ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true, _ => false, } } + +#[inline] +pub fn char_is_punctuation(ch: char) -> bool { + use unicode_general_category::{get_general_category, GeneralCategory}; + + matches!( + get_general_category(ch), + GeneralCategory::OtherPunctuation + | GeneralCategory::OpenPunctuation + | GeneralCategory::ClosePunctuation + | GeneralCategory::InitialPunctuation + | GeneralCategory::FinalPunctuation + | GeneralCategory::ConnectorPunctuation + | GeneralCategory::DashPunctuation + | GeneralCategory::MathSymbol + | GeneralCategory::CurrencySymbol + | GeneralCategory::ModifierSymbol + ) +} + +#[inline] +pub fn char_is_word(ch: char) -> bool { + ch.is_alphanumeric() || ch == '_' +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_categorize() { + const EOL_TEST_CASE: &'static str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}"; + const WORD_TEST_CASE: &'static str = + "_hello_world_あいうえおー12345678901234567890"; + const PUNCTUATION_TEST_CASE: &'static str = + "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~"; + const WHITESPACE_TEST_CASE: &'static str = " "; + + for ch in EOL_TEST_CASE.chars() { + assert_eq!(CharCategory::Eol, categorize_char(ch)); + } + + for ch in WHITESPACE_TEST_CASE.chars() { + assert_eq!( + CharCategory::Whitespace, + categorize_char(ch), + "Testing '{}', but got `{:?}` instead of `Category::Whitespace`", + ch, + categorize_char(ch) + ); + } + + for ch in WORD_TEST_CASE.chars() { + assert_eq!( + CharCategory::Word, + categorize_char(ch), + "Testing '{}', but got `{:?}` instead of `Category::Word`", + ch, + categorize_char(ch) + ); + } + + for ch in PUNCTUATION_TEST_CASE.chars() { + assert_eq!( + CharCategory::Punctuation, + categorize_char(ch), + "Testing '{}', but got `{:?}` instead of `Category::Punctuation`", + ch, + categorize_char(ch) + ); + } + } +} diff --git a/helix-core/src/lib.rs b/helix-core/src/lib.rs index 4a9ac891..f697bc7f 100644 --- a/helix-core/src/lib.rs +++ b/helix-core/src/lib.rs @@ -6,6 +6,7 @@ pub mod diagnostic; pub mod graphemes; pub mod history; pub mod indent; +pub mod line_ending; pub mod macros; pub mod match_brackets; pub mod movement; @@ -106,6 +107,7 @@ pub use tendril::StrTendril as Tendril; #[doc(inline)] pub use {regex, tree_sitter}; +pub use graphemes::RopeGraphemes; pub use position::{coords_at_pos, pos_at_coords, Position}; pub use selection::{Range, Selection}; pub use smallvec::SmallVec; @@ -114,4 +116,5 @@ pub use syntax::Syntax; pub use diagnostic::Diagnostic; pub use state::State; +pub use line_ending::{LineEnding, DEFAULT_LINE_ENDING}; pub use transaction::{Assoc, Change, ChangeSet, Operation, Transaction}; diff --git a/helix-core/src/line_ending.rs b/helix-core/src/line_ending.rs new file mode 100644 index 00000000..fa33204c --- /dev/null +++ b/helix-core/src/line_ending.rs @@ -0,0 +1,252 @@ +use crate::{Rope, RopeGraphemes, RopeSlice}; + +#[cfg(target_os = "windows")] +pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::Crlf; +#[cfg(not(target_os = "windows"))] +pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::LF; + +/// Represents one of the valid Unicode line endings. +#[derive(PartialEq, Copy, Clone, Debug)] +pub enum LineEnding { + Crlf, // CarriageReturn followed by LineFeed + LF, // U+000A -- LineFeed + VT, // U+000B -- VerticalTab + FF, // U+000C -- FormFeed + CR, // U+000D -- CarriageReturn + Nel, // U+0085 -- NextLine + LS, // U+2028 -- Line Separator + PS, // U+2029 -- ParagraphSeparator +} + +impl LineEnding { + #[inline] + pub fn len_chars(&self) -> usize { + match self { + Self::Crlf => 2, + _ => 1, + } + } + + #[inline] + pub fn as_str(&self) -> &'static str { + match self { + Self::Crlf => "\u{000D}\u{000A}", + Self::LF => "\u{000A}", + Self::VT => "\u{000B}", + Self::FF => "\u{000C}", + Self::CR => "\u{000D}", + Self::Nel => "\u{0085}", + Self::LS => "\u{2028}", + Self::PS => "\u{2029}", + } + } + + #[inline] + pub fn from_char(ch: char) -> Option<LineEnding> { + match ch { + '\u{000A}' => Some(LineEnding::LF), + '\u{000B}' => Some(LineEnding::VT), + '\u{000C}' => Some(LineEnding::FF), + '\u{000D}' => Some(LineEnding::CR), + '\u{0085}' => Some(LineEnding::Nel), + '\u{2028}' => Some(LineEnding::LS), + '\u{2029}' => Some(LineEnding::PS), + // Not a line ending + _ => None, + } + } + + // Normally we'd want to implement the FromStr trait, but in this case + // that would force us into a different return type than from_char or + // or from_rope_slice, which would be weird. + #[allow(clippy::should_implement_trait)] + #[inline] + pub fn from_str(g: &str) -> Option<LineEnding> { + match g { + "\u{000D}\u{000A}" => Some(LineEnding::Crlf), + "\u{000A}" => Some(LineEnding::LF), + "\u{000B}" => Some(LineEnding::VT), + "\u{000C}" => Some(LineEnding::FF), + "\u{000D}" => Some(LineEnding::CR), + "\u{0085}" => Some(LineEnding::Nel), + "\u{2028}" => Some(LineEnding::LS), + "\u{2029}" => Some(LineEnding::PS), + // Not a line ending + _ => None, + } + } + + #[inline] + pub fn from_rope_slice(g: &RopeSlice) -> Option<LineEnding> { + if let Some(text) = g.as_str() { + LineEnding::from_str(text) + } else { + // Non-contiguous, so it can't be a line ending. + // Specifically, Ropey guarantees that CRLF is always + // contiguous. And the remaining line endings are all + // single `char`s, and therefore trivially contiguous. + None + } + } +} + +#[inline] +pub fn str_is_line_ending(s: &str) -> bool { + LineEnding::from_str(s).is_some() +} + +/// Attempts to detect what line ending the passed document uses. +pub fn auto_detect_line_ending(doc: &Rope) -> Option<LineEnding> { + // Return first matched line ending. Not all possible line endings + // are being matched, as they might be special-use only + for line in doc.lines().take(100) { + match get_line_ending(&line) { + None | Some(LineEnding::VT) | Some(LineEnding::FF) | Some(LineEnding::PS) => {} + ending => return ending, + } + } + None +} + +/// Returns the passed line's line ending, if any. +pub fn get_line_ending(line: &RopeSlice) -> Option<LineEnding> { + // Last character as str. + let g1 = line + .slice(line.len_chars().saturating_sub(1)..) + .as_str() + .unwrap(); + + // Last two characters as str, or empty str if they're not contiguous. + // It's fine to punt on the non-contiguous case, because Ropey guarantees + // that CRLF is always contiguous. + let g2 = line + .slice(line.len_chars().saturating_sub(2)..) + .as_str() + .unwrap_or(""); + + // First check the two-character case for CRLF, then check the single-character case. + LineEnding::from_str(g2).or_else(|| LineEnding::from_str(g1)) +} + +/// Returns the passed line's line ending, if any. +pub fn get_line_ending_of_str(line: &str) -> Option<LineEnding> { + if line.ends_with("\u{000D}\u{000A}") { + Some(LineEnding::Crlf) + } else if line.ends_with('\u{000A}') { + Some(LineEnding::LF) + } else if line.ends_with('\u{000B}') { + Some(LineEnding::VT) + } else if line.ends_with('\u{000C}') { + Some(LineEnding::FF) + } else if line.ends_with('\u{000D}') { + Some(LineEnding::CR) + } else if line.ends_with('\u{0085}') { + Some(LineEnding::Nel) + } else if line.ends_with('\u{2028}') { + Some(LineEnding::LS) + } else if line.ends_with('\u{2029}') { + Some(LineEnding::PS) + } else { + None + } +} + +/// Returns the char index of the end of the given line, not including its line ending. +pub fn line_end_char_index(slice: &RopeSlice, line: usize) -> usize { + slice.line_to_char(line + 1) + - get_line_ending(&slice.line(line)) + .map(|le| le.len_chars()) + .unwrap_or(0) +} + +#[cfg(test)] +mod line_ending_tests { + use super::*; + + #[test] + fn line_ending_autodetect() { + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\n")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\r\n")), + Some(LineEnding::Crlf) + ); + assert_eq!(auto_detect_line_ending(&Rope::from_str("hello")), None); + assert_eq!(auto_detect_line_ending(&Rope::from_str("")), None); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("hello\nhelix\r\n")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C}")), + None + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\n\u{000A}\n \u{000A}")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str( + "a formfeed\u{000C} with a\u{000C} linefeed\u{000A}" + )), + Some(LineEnding::LF) + ); + assert_eq!(auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C} with a\u{000C} carriage return linefeed\u{000D}\u{000A} and a linefeed\u{000A}")), Some(LineEnding::Crlf)); + } + + #[test] + fn str_to_line_ending() { + assert_eq!(LineEnding::from_str("\r"), Some(LineEnding::CR)); + assert_eq!(LineEnding::from_str("\n"), Some(LineEnding::LF)); + assert_eq!(LineEnding::from_str("\r\n"), Some(LineEnding::Crlf)); + assert_eq!(LineEnding::from_str("hello\n"), None); + } + + #[test] + fn rope_slice_to_line_ending() { + let r = Rope::from_str("hello\r\n"); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(5..6)), + Some(LineEnding::CR) + ); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(6..7)), + Some(LineEnding::LF) + ); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(5..7)), + Some(LineEnding::Crlf) + ); + assert_eq!(LineEnding::from_rope_slice(&r.slice(..)), None); + } + + #[test] + fn get_line_ending_rope_slice() { + let r = Rope::from_str("Hello\rworld\nhow\r\nare you?"); + assert_eq!(get_line_ending(&r.slice(..6)), Some(LineEnding::CR)); + assert_eq!(get_line_ending(&r.slice(..12)), Some(LineEnding::LF)); + assert_eq!(get_line_ending(&r.slice(..17)), Some(LineEnding::Crlf)); + assert_eq!(get_line_ending(&r.slice(..)), None); + } + + #[test] + fn get_line_ending_str() { + let text = "Hello\rworld\nhow\r\nare you?"; + assert_eq!(get_line_ending_of_str(&text[..6]), Some(LineEnding::CR)); + assert_eq!(get_line_ending_of_str(&text[..12]), Some(LineEnding::LF)); + assert_eq!(get_line_ending_of_str(&text[..17]), Some(LineEnding::Crlf)); + assert_eq!(get_line_ending_of_str(&text[..]), None); + } + + #[test] + fn line_end_char_index_rope_slice() { + let r = Rope::from_str("Hello\rworld\nhow\r\nare you?"); + let s = &r.slice(..); + assert_eq!(line_end_char_index(s, 0), 5); + assert_eq!(line_end_char_index(s, 1), 11); + assert_eq!(line_end_char_index(s, 2), 15); + assert_eq!(line_end_char_index(s, 3), 25); + } +} diff --git a/helix-core/src/movement.rs b/helix-core/src/movement.rs index 8b1e802f..bfceb4ef 100644 --- a/helix-core/src/movement.rs +++ b/helix-core/src/movement.rs @@ -3,8 +3,13 @@ use std::iter::{self, from_fn, Peekable, SkipWhile}; use ropey::iter::Chars; use crate::{ + chars::{ + categorize_char, char_is_line_ending, char_is_punctuation, char_is_whitespace, + char_is_word, CharCategory, + }, coords_at_pos, graphemes::{nth_next_grapheme_boundary, nth_prev_grapheme_boundary}, + line_ending::{get_line_ending, line_end_char_index}, pos_at_coords, Position, Range, RopeSlice, }; @@ -37,9 +42,8 @@ pub fn move_horizontally( nth_prev_grapheme_boundary(slice, pos, count).max(start) } Direction::Forward => { - // Line end is pos at the start of next line - 1 - let end = slice.line_to_char(line + 1).saturating_sub(1); - nth_next_grapheme_boundary(slice, pos, count).min(end) + let end_char_idx = line_end_char_index(&slice, line); + nth_next_grapheme_boundary(slice, pos, count).min(end_char_idx) } }; let anchor = match behaviour { @@ -68,8 +72,11 @@ pub fn move_vertically( ), }; - // convert to 0-indexed, subtract another 1 because len_chars() counts \n - let new_line_len = slice.line(new_line).len_chars().saturating_sub(2); + // Length of the line sans line-ending. + let new_line_len = { + let line = slice.line(new_line); + line.len_chars() - get_line_ending(&line).map(|le| le.len_chars()).unwrap_or(0) + }; let new_col = std::cmp::min(horiz as usize, new_line_len); @@ -104,64 +111,6 @@ fn word_move(slice: RopeSlice, mut range: Range, count: usize, target: WordMotio } // ---- util ------------ -#[inline] -pub(crate) fn is_word(ch: char) -> bool { - ch.is_alphanumeric() || ch == '_' -} - -#[inline] -pub(crate) fn is_end_of_line(ch: char) -> bool { - ch == '\n' -} - -#[inline] -// Whitespace, but not end of line -pub(crate) fn is_strict_whitespace(ch: char) -> bool { - ch.is_whitespace() && !is_end_of_line(ch) -} - -#[inline] -pub(crate) fn is_punctuation(ch: char) -> bool { - use unicode_general_category::{get_general_category, GeneralCategory}; - - matches!( - get_general_category(ch), - GeneralCategory::OtherPunctuation - | GeneralCategory::OpenPunctuation - | GeneralCategory::ClosePunctuation - | GeneralCategory::InitialPunctuation - | GeneralCategory::FinalPunctuation - | GeneralCategory::ConnectorPunctuation - | GeneralCategory::DashPunctuation - | GeneralCategory::MathSymbol - | GeneralCategory::CurrencySymbol - | GeneralCategory::ModifierSymbol - ) -} - -#[derive(Debug, Eq, PartialEq)] -pub enum Category { - Whitespace, - Eol, - Word, - Punctuation, - Unknown, -} - -#[inline] -pub(crate) fn categorize(ch: char) -> Category { - if is_end_of_line(ch) { - Category::Eol - } else if ch.is_whitespace() { - Category::Whitespace - } else if is_word(ch) { - Category::Word - } else if is_punctuation(ch) { - Category::Punctuation - } else { - Category::Unknown - } -} #[inline] /// Returns first index that doesn't satisfy a given predicate when @@ -235,7 +184,8 @@ impl CharHelpers for Chars<'_> { let mut phase = WordMotionPhase::Start; let mut head = origin.head; let mut anchor: Option<usize> = None; - let is_boundary = |a: char, b: Option<char>| categorize(a) != categorize(b.unwrap_or(a)); + let is_boundary = + |a: char, b: Option<char>| categorize_char(a) != categorize_char(b.unwrap_or(a)); while let Some(peek) = characters.peek().copied() { phase = match phase { WordMotionPhase::Start => { @@ -244,7 +194,8 @@ impl CharHelpers for Chars<'_> { break; // We're at the end, so there's nothing to do. } // Anchor may remain here if the head wasn't at a boundary - if !is_boundary(peek, characters.peek().copied()) && !is_end_of_line(peek) { + if !is_boundary(peek, characters.peek().copied()) && !char_is_line_ending(peek) + { anchor = Some(head); } // First character is always skipped by the head @@ -252,7 +203,7 @@ impl CharHelpers for Chars<'_> { WordMotionPhase::SkipNewlines } WordMotionPhase::SkipNewlines => { - if is_end_of_line(peek) { + if char_is_line_ending(peek) { characters.next(); if characters.peek().is_some() { advance(&mut head); @@ -286,12 +237,12 @@ fn reached_target(target: WordMotionTarget, peek: char, next_peek: Option<&char> match target { WordMotionTarget::NextWordStart => { - ((categorize(peek) != categorize(*next_peek)) - && (is_end_of_line(*next_peek) || !next_peek.is_whitespace())) + ((categorize_char(peek) != categorize_char(*next_peek)) + && (char_is_line_ending(*next_peek) || !next_peek.is_whitespace())) } WordMotionTarget::NextWordEnd | WordMotionTarget::PrevWordStart => { - ((categorize(peek) != categorize(*next_peek)) - && (!peek.is_whitespace() || is_end_of_line(*next_peek))) + ((categorize_char(peek) != categorize_char(*next_peek)) + && (!peek.is_whitespace() || char_is_line_ending(*next_peek))) } } } @@ -330,7 +281,7 @@ mod test { slice, move_vertically(slice, range, Direction::Forward, 1, Movement::Move).head ), - (1, 2).into() + (1, 3).into() ); } @@ -343,12 +294,12 @@ mod test { let mut range = Range::point(position); let moves_and_expected_coordinates = [ - ((Direction::Forward, 1usize), (0, 1)), - ((Direction::Forward, 2usize), (0, 3)), - ((Direction::Forward, 0usize), (0, 3)), - ((Direction::Forward, 999usize), (0, 31)), - ((Direction::Forward, 999usize), (0, 31)), - ((Direction::Backward, 999usize), (0, 0)), + ((Direction::Forward, 1usize), (0, 1)), // T|his is a simple alphabetic line + ((Direction::Forward, 2usize), (0, 3)), // Thi|s is a simple alphabetic line + ((Direction::Forward, 0usize), (0, 3)), // Thi|s is a simple alphabetic line + ((Direction::Forward, 999usize), (0, 32)), // This is a simple alphabetic line| + ((Direction::Forward, 999usize), (0, 32)), // This is a simple alphabetic line| + ((Direction::Backward, 999usize), (0, 0)), // |This is a simple alphabetic line ]; for ((direction, amount), coordinates) in IntoIter::new(moves_and_expected_coordinates) { @@ -366,15 +317,15 @@ mod test { let mut range = Range::point(position); let moves_and_expected_coordinates = IntoIter::new([ - ((Direction::Forward, 1usize), (0, 1)), // M_ltiline - ((Direction::Forward, 2usize), (0, 3)), // Mul_iline - ((Direction::Backward, 6usize), (0, 0)), // _ultiline - ((Direction::Backward, 999usize), (0, 0)), // _ultiline - ((Direction::Forward, 3usize), (0, 3)), // Mul_iline - ((Direction::Forward, 0usize), (0, 3)), // Mul_iline - ((Direction::Backward, 0usize), (0, 3)), // Mul_iline - ((Direction::Forward, 999usize), (0, 9)), // Multilin_ - ((Direction::Forward, 999usize), (0, 9)), // Multilin_ + ((Direction::Forward, 1usize), (0, 1)), // M|ultiline\n + ((Direction::Forward, 2usize), (0, 3)), // Mul|tiline\n + ((Direction::Backward, 6usize), (0, 0)), // |Multiline\n + ((Direction::Backward, 999usize), (0, 0)), // |Multiline\n + ((Direction::Forward, 3usize), (0, 3)), // Mul|tiline\n + ((Direction::Forward, 0usize), (0, 3)), // Mul|tiline\n + ((Direction::Backward, 0usize), (0, 3)), // Mul|tiline\n + ((Direction::Forward, 999usize), (0, 9)), // Multiline|\n + ((Direction::Forward, 999usize), (0, 9)), // Multiline|\n ]); for ((direction, amount), coordinates) in moves_and_expected_coordinates { @@ -446,7 +397,7 @@ mod test { // First descent preserves column as the target line is wider ((Axis::V, Direction::Forward, 1usize), (1, 8)), // Second descent clamps column as the target line is shorter - ((Axis::V, Direction::Forward, 1usize), (2, 4)), + ((Axis::V, Direction::Forward, 1usize), (2, 5)), // Third descent restores the original column ((Axis::V, Direction::Forward, 1usize), (3, 8)), // Behaviour is preserved even through long jumps @@ -760,45 +711,4 @@ mod test { } } } - - #[test] - fn test_categorize() { - const WORD_TEST_CASE: &'static str = - "_hello_world_あいうえおー12345678901234567890"; - const PUNCTUATION_TEST_CASE: &'static str = - "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~"; - const WHITESPACE_TEST_CASE: &'static str = " "; - - assert_eq!(Category::Eol, categorize('\n')); - - for ch in WHITESPACE_TEST_CASE.chars() { - assert_eq!( - Category::Whitespace, - categorize(ch), - "Testing '{}', but got `{:?}` instead of `Category::Whitespace`", - ch, - categorize(ch) - ); - } - - for ch in WORD_TEST_CASE.chars() { - assert_eq!( - Category::Word, - categorize(ch), - "Testing '{}', but got `{:?}` instead of `Category::Word`", - ch, - categorize(ch) - ); - } - - for ch in PUNCTUATION_TEST_CASE.chars() { - assert_eq!( - Category::Punctuation, - categorize(ch), - "Testing '{}', but got `{:?}` instead of `Category::Punctuation`", - ch, - categorize(ch) - ); - } - } } diff --git a/helix-core/src/position.rs b/helix-core/src/position.rs index 3d85ff2f..392eee9c 100644 --- a/helix-core/src/position.rs +++ b/helix-core/src/position.rs @@ -1,4 +1,5 @@ use crate::{ + chars::char_is_line_ending, graphemes::{nth_next_grapheme_boundary, RopeGraphemes}, Rope, RopeSlice, }; @@ -23,8 +24,9 @@ impl Position { pub fn traverse(self, text: &crate::Tendril) -> Self { let Self { mut row, mut col } = self; // TODO: there should be a better way here - for ch in text.chars() { - if ch == '\n' { + let mut chars = text.chars().peekable(); + while let Some(ch) = chars.next() { + if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) { row += 1; col = 0; } else { diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index 81b6d5a0..63ca424e 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -1,4 +1,4 @@ -use crate::{regex::Regex, Change, Rope, RopeSlice, Transaction}; +use crate::{chars::char_is_line_ending, regex::Regex, Change, Rope, RopeSlice, Transaction}; pub use helix_syntax::{get_language, get_language_name, Lang}; use arc_swap::ArcSwap; @@ -589,9 +589,10 @@ impl LanguageLayer { mut column, } = point; - // TODO: there should be a better way here - for ch in text.bytes() { - if ch == b'\n' { + // TODO: there should be a better way here. + let mut chars = text.chars().peekable(); + while let Some(ch) = chars.next() { + if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) { row += 1; column = 0; } else { |