diff options
author | Blaž Hrastnik | 2021-06-22 02:09:19 +0000 |
---|---|---|
committer | GitHub | 2021-06-22 02:09:19 +0000 |
commit | a70de6e980ec58cabf58c33e8b91bfafbea312eb (patch) | |
tree | 476c07b84ee3f399eb55c8b549641a59eedc4e1c /helix-core/src/line_ending.rs | |
parent | c704970fd71a1a29ef8397ff2ab9e12c5b780a81 (diff) | |
parent | f2954fa153ccb6b147d8d38020341a2f1b0b6df2 (diff) |
Merge pull request #224 from helix-editor/line_ending_detection
Line ending detection
Diffstat (limited to 'helix-core/src/line_ending.rs')
-rw-r--r-- | helix-core/src/line_ending.rs | 252 |
1 files changed, 252 insertions, 0 deletions
diff --git a/helix-core/src/line_ending.rs b/helix-core/src/line_ending.rs new file mode 100644 index 00000000..fa33204c --- /dev/null +++ b/helix-core/src/line_ending.rs @@ -0,0 +1,252 @@ +use crate::{Rope, RopeGraphemes, RopeSlice}; + +#[cfg(target_os = "windows")] +pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::Crlf; +#[cfg(not(target_os = "windows"))] +pub const DEFAULT_LINE_ENDING: LineEnding = LineEnding::LF; + +/// Represents one of the valid Unicode line endings. +#[derive(PartialEq, Copy, Clone, Debug)] +pub enum LineEnding { + Crlf, // CarriageReturn followed by LineFeed + LF, // U+000A -- LineFeed + VT, // U+000B -- VerticalTab + FF, // U+000C -- FormFeed + CR, // U+000D -- CarriageReturn + Nel, // U+0085 -- NextLine + LS, // U+2028 -- Line Separator + PS, // U+2029 -- ParagraphSeparator +} + +impl LineEnding { + #[inline] + pub fn len_chars(&self) -> usize { + match self { + Self::Crlf => 2, + _ => 1, + } + } + + #[inline] + pub fn as_str(&self) -> &'static str { + match self { + Self::Crlf => "\u{000D}\u{000A}", + Self::LF => "\u{000A}", + Self::VT => "\u{000B}", + Self::FF => "\u{000C}", + Self::CR => "\u{000D}", + Self::Nel => "\u{0085}", + Self::LS => "\u{2028}", + Self::PS => "\u{2029}", + } + } + + #[inline] + pub fn from_char(ch: char) -> Option<LineEnding> { + match ch { + '\u{000A}' => Some(LineEnding::LF), + '\u{000B}' => Some(LineEnding::VT), + '\u{000C}' => Some(LineEnding::FF), + '\u{000D}' => Some(LineEnding::CR), + '\u{0085}' => Some(LineEnding::Nel), + '\u{2028}' => Some(LineEnding::LS), + '\u{2029}' => Some(LineEnding::PS), + // Not a line ending + _ => None, + } + } + + // Normally we'd want to implement the FromStr trait, but in this case + // that would force us into a different return type than from_char or + // or from_rope_slice, which would be weird. + #[allow(clippy::should_implement_trait)] + #[inline] + pub fn from_str(g: &str) -> Option<LineEnding> { + match g { + "\u{000D}\u{000A}" => Some(LineEnding::Crlf), + "\u{000A}" => Some(LineEnding::LF), + "\u{000B}" => Some(LineEnding::VT), + "\u{000C}" => Some(LineEnding::FF), + "\u{000D}" => Some(LineEnding::CR), + "\u{0085}" => Some(LineEnding::Nel), + "\u{2028}" => Some(LineEnding::LS), + "\u{2029}" => Some(LineEnding::PS), + // Not a line ending + _ => None, + } + } + + #[inline] + pub fn from_rope_slice(g: &RopeSlice) -> Option<LineEnding> { + if let Some(text) = g.as_str() { + LineEnding::from_str(text) + } else { + // Non-contiguous, so it can't be a line ending. + // Specifically, Ropey guarantees that CRLF is always + // contiguous. And the remaining line endings are all + // single `char`s, and therefore trivially contiguous. + None + } + } +} + +#[inline] +pub fn str_is_line_ending(s: &str) -> bool { + LineEnding::from_str(s).is_some() +} + +/// Attempts to detect what line ending the passed document uses. +pub fn auto_detect_line_ending(doc: &Rope) -> Option<LineEnding> { + // Return first matched line ending. Not all possible line endings + // are being matched, as they might be special-use only + for line in doc.lines().take(100) { + match get_line_ending(&line) { + None | Some(LineEnding::VT) | Some(LineEnding::FF) | Some(LineEnding::PS) => {} + ending => return ending, + } + } + None +} + +/// Returns the passed line's line ending, if any. +pub fn get_line_ending(line: &RopeSlice) -> Option<LineEnding> { + // Last character as str. + let g1 = line + .slice(line.len_chars().saturating_sub(1)..) + .as_str() + .unwrap(); + + // Last two characters as str, or empty str if they're not contiguous. + // It's fine to punt on the non-contiguous case, because Ropey guarantees + // that CRLF is always contiguous. + let g2 = line + .slice(line.len_chars().saturating_sub(2)..) + .as_str() + .unwrap_or(""); + + // First check the two-character case for CRLF, then check the single-character case. + LineEnding::from_str(g2).or_else(|| LineEnding::from_str(g1)) +} + +/// Returns the passed line's line ending, if any. +pub fn get_line_ending_of_str(line: &str) -> Option<LineEnding> { + if line.ends_with("\u{000D}\u{000A}") { + Some(LineEnding::Crlf) + } else if line.ends_with('\u{000A}') { + Some(LineEnding::LF) + } else if line.ends_with('\u{000B}') { + Some(LineEnding::VT) + } else if line.ends_with('\u{000C}') { + Some(LineEnding::FF) + } else if line.ends_with('\u{000D}') { + Some(LineEnding::CR) + } else if line.ends_with('\u{0085}') { + Some(LineEnding::Nel) + } else if line.ends_with('\u{2028}') { + Some(LineEnding::LS) + } else if line.ends_with('\u{2029}') { + Some(LineEnding::PS) + } else { + None + } +} + +/// Returns the char index of the end of the given line, not including its line ending. +pub fn line_end_char_index(slice: &RopeSlice, line: usize) -> usize { + slice.line_to_char(line + 1) + - get_line_ending(&slice.line(line)) + .map(|le| le.len_chars()) + .unwrap_or(0) +} + +#[cfg(test)] +mod line_ending_tests { + use super::*; + + #[test] + fn line_ending_autodetect() { + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\n")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\r\n")), + Some(LineEnding::Crlf) + ); + assert_eq!(auto_detect_line_ending(&Rope::from_str("hello")), None); + assert_eq!(auto_detect_line_ending(&Rope::from_str("")), None); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("hello\nhelix\r\n")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C}")), + None + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str("\n\u{000A}\n \u{000A}")), + Some(LineEnding::LF) + ); + assert_eq!( + auto_detect_line_ending(&Rope::from_str( + "a formfeed\u{000C} with a\u{000C} linefeed\u{000A}" + )), + Some(LineEnding::LF) + ); + assert_eq!(auto_detect_line_ending(&Rope::from_str("a formfeed\u{000C} with a\u{000C} carriage return linefeed\u{000D}\u{000A} and a linefeed\u{000A}")), Some(LineEnding::Crlf)); + } + + #[test] + fn str_to_line_ending() { + assert_eq!(LineEnding::from_str("\r"), Some(LineEnding::CR)); + assert_eq!(LineEnding::from_str("\n"), Some(LineEnding::LF)); + assert_eq!(LineEnding::from_str("\r\n"), Some(LineEnding::Crlf)); + assert_eq!(LineEnding::from_str("hello\n"), None); + } + + #[test] + fn rope_slice_to_line_ending() { + let r = Rope::from_str("hello\r\n"); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(5..6)), + Some(LineEnding::CR) + ); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(6..7)), + Some(LineEnding::LF) + ); + assert_eq!( + LineEnding::from_rope_slice(&r.slice(5..7)), + Some(LineEnding::Crlf) + ); + assert_eq!(LineEnding::from_rope_slice(&r.slice(..)), None); + } + + #[test] + fn get_line_ending_rope_slice() { + let r = Rope::from_str("Hello\rworld\nhow\r\nare you?"); + assert_eq!(get_line_ending(&r.slice(..6)), Some(LineEnding::CR)); + assert_eq!(get_line_ending(&r.slice(..12)), Some(LineEnding::LF)); + assert_eq!(get_line_ending(&r.slice(..17)), Some(LineEnding::Crlf)); + assert_eq!(get_line_ending(&r.slice(..)), None); + } + + #[test] + fn get_line_ending_str() { + let text = "Hello\rworld\nhow\r\nare you?"; + assert_eq!(get_line_ending_of_str(&text[..6]), Some(LineEnding::CR)); + assert_eq!(get_line_ending_of_str(&text[..12]), Some(LineEnding::LF)); + assert_eq!(get_line_ending_of_str(&text[..17]), Some(LineEnding::Crlf)); + assert_eq!(get_line_ending_of_str(&text[..]), None); + } + + #[test] + fn line_end_char_index_rope_slice() { + let r = Rope::from_str("Hello\rworld\nhow\r\nare you?"); + let s = &r.slice(..); + assert_eq!(line_end_char_index(s, 0), 5); + assert_eq!(line_end_char_index(s, 1), 11); + assert_eq!(line_end_char_index(s, 2), 15); + assert_eq!(line_end_char_index(s, 3), 25); + } +} |