From b0b3f45b80931e73eadaf7e73f1981283b8e49fc Mon Sep 17 00:00:00 2001 From: Alexis-Lapierre Date: Mon, 1 May 2023 00:40:06 +0200 Subject: Conserve BOM and properly support UTF16 (#6497) --- helix-view/src/document.rs | 178 ++++++++++++++++++++++++++++++++++++--------- helix-view/src/editor.rs | 4 +- 2 files changed, 146 insertions(+), 36 deletions(-) (limited to 'helix-view') diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs index 5ede5bc6..11b9ef0e 100644 --- a/helix-view/src/document.rs +++ b/helix-view/src/document.rs @@ -5,6 +5,7 @@ use futures_util::future::BoxFuture; use futures_util::FutureExt; use helix_core::auto_pairs::AutoPairs; use helix_core::doc_formatter::TextFormat; +use helix_core::encoding::Encoding; use helix_core::syntax::Highlight; use helix_core::text_annotations::{InlineAnnotation, TextAnnotations}; use helix_core::Range; @@ -130,6 +131,7 @@ pub struct Document { path: Option, encoding: &'static encoding::Encoding, + has_bom: bool, pub restore_cursor: bool, @@ -277,16 +279,104 @@ impl fmt::Debug for DocumentInlayHintsId { } } +enum Encoder { + Utf16Be, + Utf16Le, + EncodingRs(encoding::Encoder), +} + +impl Encoder { + fn from_encoding(encoding: &'static encoding::Encoding) -> Self { + if encoding == encoding::UTF_16BE { + Self::Utf16Be + } else if encoding == encoding::UTF_16LE { + Self::Utf16Le + } else { + Self::EncodingRs(encoding.new_encoder()) + } + } + + fn encode_from_utf8( + &mut self, + src: &str, + dst: &mut [u8], + is_empty: bool, + ) -> (encoding::CoderResult, usize, usize) { + if src.is_empty() { + return (encoding::CoderResult::InputEmpty, 0, 0); + } + let mut write_to_buf = |convert: fn(u16) -> [u8; 2]| { + let to_write = src.char_indices().map(|(indice, char)| { + let mut encoded: [u16; 2] = [0, 0]; + ( + indice, + char.encode_utf16(&mut encoded) + .iter_mut() + .flat_map(|char| convert(*char)) + .collect::>(), + ) + }); + + let mut total_written = 0usize; + + for (indice, utf16_bytes) in to_write { + let character_size = utf16_bytes.len(); + + if dst.len() <= (total_written + character_size) { + return (encoding::CoderResult::OutputFull, indice, total_written); + } + + for character in utf16_bytes { + dst[total_written] = character; + total_written += 1; + } + } + + (encoding::CoderResult::InputEmpty, src.len(), total_written) + }; + + match self { + Self::Utf16Be => write_to_buf(u16::to_be_bytes), + Self::Utf16Le => write_to_buf(u16::to_le_bytes), + Self::EncodingRs(encoder) => { + let (code_result, read, written, ..) = encoder.encode_from_utf8(src, dst, is_empty); + + (code_result, read, written) + } + } + } +} + +// Apply BOM if encoding permit it, return the number of bytes written at the start of buf +fn apply_bom(encoding: &'static encoding::Encoding, buf: &mut [u8; BUF_SIZE]) -> usize { + if encoding == encoding::UTF_8 { + buf[0] = 0xef; + buf[1] = 0xbb; + buf[2] = 0xbf; + 3 + } else if encoding == encoding::UTF_16BE { + buf[0] = 0xfe; + buf[1] = 0xff; + 2 + } else if encoding == encoding::UTF_16LE { + buf[0] = 0xff; + buf[1] = 0xfe; + 2 + } else { + 0 + } +} + // The documentation and implementation of this function should be up-to-date with // its sibling function, `to_writer()`. // /// Decodes a stream of bytes into UTF-8, returning a `Rope` and the -/// encoding it was decoded as. The optional `encoding` parameter can -/// be used to override encoding auto-detection. +/// encoding it was decoded as with BOM information. The optional `encoding` +/// parameter can be used to override encoding auto-detection. pub fn from_reader( reader: &mut R, - encoding: Option<&'static encoding::Encoding>, -) -> Result<(Rope, &'static encoding::Encoding), Error> { + encoding: Option<&'static Encoding>, +) -> Result<(Rope, &'static Encoding, bool), Error> { // These two buffers are 8192 bytes in size each and are used as // intermediaries during the decoding process. Text read into `buf` // from `reader` is decoded into `buf_out` as UTF-8. Once either @@ -296,25 +386,32 @@ pub fn from_reader( let mut buf_out = [0u8; BUF_SIZE]; let mut builder = RopeBuilder::new(); - // By default, the encoding of the text is auto-detected via the - // `chardetng` crate which requires sample data from the reader. + // By default, the encoding of the text is auto-detected by + // `encoding_rs` for_bom, and if it fails, from `chardetng` + // crate which requires sample data from the reader. // As a manual override to this auto-detection is possible, the // same data is read into `buf` to ensure symmetry in the upcoming // loop. - let (encoding, mut decoder, mut slice, mut is_empty) = { + let (encoding, has_bom, mut decoder, mut slice, mut is_empty) = { let read = reader.read(&mut buf)?; let is_empty = read == 0; - let encoding = encoding.unwrap_or_else(|| { - let mut encoding_detector = chardetng::EncodingDetector::new(); - encoding_detector.feed(&buf, is_empty); - encoding_detector.guess(None, true) - }); + let (encoding, has_bom) = encoding + .map(|encoding| (encoding, false)) + .or_else(|| { + encoding::Encoding::for_bom(&buf).map(|(encoding, _bom_size)| (encoding, true)) + }) + .unwrap_or_else(|| { + let mut encoding_detector = chardetng::EncodingDetector::new(); + encoding_detector.feed(&buf, is_empty); + (encoding_detector.guess(None, true), false) + }); + let decoder = encoding.new_decoder(); // If the amount of bytes read from the reader is less than // `buf.len()`, it is undesirable to read the bytes afterwards. let slice = &buf[..read]; - (encoding, decoder, slice, is_empty) + (encoding, has_bom, decoder, slice, is_empty) }; // `RopeBuilder::append()` expects a `&str`, so this is the "real" @@ -382,7 +479,7 @@ pub fn from_reader( is_empty = read == 0; } let rope = builder.finish(); - Ok((rope, encoding)) + Ok((rope, encoding, has_bom)) } // The documentation and implementation of this function should be up-to-date with @@ -393,7 +490,7 @@ pub fn from_reader( /// replacement characters may appear in the encoded text. pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>( writer: &'a mut W, - encoding: &'static encoding::Encoding, + encoding_with_bom_info: (&'static Encoding, bool), rope: &'a Rope, ) -> Result<(), Error> { // Text inside a `Rope` is stored as non-contiguous blocks of data called @@ -402,13 +499,22 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>( // determined by filtering the iterator to remove all empty chunks and then // appending an empty chunk to it. This is valuable for detecting when all // chunks in the `Rope` have been iterated over in the subsequent loop. + let (encoding, has_bom) = encoding_with_bom_info; + let iter = rope .chunks() .filter(|c| !c.is_empty()) .chain(std::iter::once("")); let mut buf = [0u8; BUF_SIZE]; - let mut encoder = encoding.new_encoder(); - let mut total_written = 0usize; + + let mut total_written = if has_bom { + apply_bom(encoding, &mut buf) + } else { + 0 + }; + + let mut encoder = Encoder::from_encoding(encoding); + for chunk in iter { let is_empty = chunk.is_empty(); let mut total_read = 0usize; @@ -449,6 +555,7 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>( break; } } + Ok(()) } @@ -466,10 +573,10 @@ use url::Url; impl Document { pub fn from( text: Rope, - encoding: Option<&'static encoding::Encoding>, + encoding_with_bom_info: Option<(&'static Encoding, bool)>, config: Arc>, ) -> Self { - let encoding = encoding.unwrap_or(encoding::UTF_8); + let (encoding, has_bom) = encoding_with_bom_info.unwrap_or((encoding::UTF_8, false)); let changes = ChangeSet::new(&text); let old_state = None; @@ -477,6 +584,7 @@ impl Document { id: DocumentId::default(), path: None, encoding, + has_bom, text, selections: HashMap::default(), inlay_hints: HashMap::default(), @@ -511,21 +619,21 @@ impl Document { /// overwritten with the `encoding` parameter. pub fn open( path: &Path, - encoding: Option<&'static encoding::Encoding>, + encoding: Option<&'static Encoding>, config_loader: Option>, config: Arc>, ) -> Result { // Open the file if it exists, otherwise assume it is a new file (and thus empty). - let (rope, encoding) = if path.exists() { + let (rope, encoding, has_bom) = if path.exists() { let mut file = std::fs::File::open(path).context(format!("unable to open {:?}", path))?; from_reader(&mut file, encoding)? } else { let encoding = encoding.unwrap_or(encoding::UTF_8); - (Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding) + (Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding, false) }; - let mut doc = Self::from(rope, Some(encoding), config); + let mut doc = Self::from(rope, Some((encoding, has_bom)), config); // set the path and try detecting the language doc.set_path(Some(path))?; @@ -576,7 +684,7 @@ impl Document { })?; { let mut stdin = process.stdin.take().ok_or(FormatterError::BrokenStdin)?; - to_writer(&mut stdin, encoding::UTF_8, &text) + to_writer(&mut stdin, (encoding::UTF_8, false), &text) .await .map_err(|_| FormatterError::BrokenStdin)?; } @@ -688,8 +796,7 @@ impl Document { let current_rev = self.get_current_revision(); let doc_id = self.id(); - let encoding = self.encoding; - + let encoding_with_bom_info = (self.encoding, self.has_bom); let last_saved_time = self.last_saved_time; // We encode the file according to the `Document`'s encoding. @@ -718,7 +825,7 @@ impl Document { } let mut file = File::create(&path).await?; - to_writer(&mut file, encoding, &text).await?; + to_writer(&mut file, encoding_with_bom_info, &text).await?; let event = DocumentSavedEvent { revision: current_rev, @@ -776,7 +883,7 @@ impl Document { provider_registry: &DiffProviderRegistry, redraw_handle: RedrawHandle, ) -> Result<(), Error> { - let encoding = &self.encoding; + let encoding = self.encoding; let path = self .path() .filter(|path| path.exists()) @@ -810,13 +917,16 @@ impl Document { /// Sets the [`Document`]'s encoding with the encoding correspondent to `label`. pub fn set_encoding(&mut self, label: &str) -> Result<(), Error> { - self.encoding = encoding::Encoding::for_label(label.as_bytes()) - .ok_or_else(|| anyhow!("unknown encoding"))?; + let encoding = + Encoding::for_label(label.as_bytes()).ok_or_else(|| anyhow!("unknown encoding"))?; + + self.encoding = encoding; + Ok(()) } /// Returns the [`Document`]'s current encoding. - pub fn encoding(&self) -> &'static encoding::Encoding { + pub fn encoding(&self) -> &'static Encoding { self.encoding } @@ -1280,7 +1390,7 @@ impl Document { /// Intialize/updates the differ for this document with a new base. pub fn set_diff_base(&mut self, diff_base: Vec, redraw_handle: RedrawHandle) { - if let Ok((diff_base, _)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) { + if let Ok((diff_base, ..)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) { if let Some(differ) = &self.diff_handle { differ.update_diff_base(diff_base); return; @@ -1724,7 +1834,7 @@ mod test { assert!(ref_path.exists()); let mut file = std::fs::File::open(path).unwrap(); - let text = from_reader(&mut file, Some(encoding)) + let text = from_reader(&mut file, Some(encoding.into())) .unwrap() .0 .to_string(); @@ -1750,7 +1860,7 @@ mod test { let text = Rope::from_str(&std::fs::read_to_string(path).unwrap()); let mut buf: Vec = Vec::new(); - helix_lsp::block_on(to_writer(&mut buf, encoding, &text)).unwrap(); + helix_lsp::block_on(to_writer(&mut buf, (encoding, false), &text)).unwrap(); let expectation = std::fs::read(ref_path).unwrap(); assert_eq!(buf, expectation); diff --git a/helix-view/src/editor.rs b/helix-view/src/editor.rs index 005c6667..8e4dab41 100644 --- a/helix-view/src/editor.rs +++ b/helix-view/src/editor.rs @@ -1305,10 +1305,10 @@ impl Editor { } pub fn new_file_from_stdin(&mut self, action: Action) -> Result { - let (rope, encoding) = crate::document::from_reader(&mut stdin(), None)?; + let (rope, encoding, has_bom) = crate::document::from_reader(&mut stdin(), None)?; Ok(self.new_file_from_document( action, - Document::from(rope, Some(encoding), self.config.clone()), + Document::from(rope, Some((encoding, has_bom)), self.config.clone()), )) } -- cgit v1.2.3-70-g09d2