aboutsummaryrefslogtreecommitdiff
path: root/helix-view/src
diff options
context:
space:
mode:
authorAlexis-Lapierre2023-04-30 22:40:06 +0000
committerGitHub2023-04-30 22:40:06 +0000
commitb0b3f45b80931e73eadaf7e73f1981283b8e49fc (patch)
tree77329232f1ca98e5eaa131f471ea25b23e5eb0d5 /helix-view/src
parentefd09b6c7ccf8cdfde5856fd9db0d9b29ea5bd81 (diff)
Conserve BOM and properly support UTF16 (#6497)
Diffstat (limited to 'helix-view/src')
-rw-r--r--helix-view/src/document.rs178
-rw-r--r--helix-view/src/editor.rs4
2 files changed, 146 insertions, 36 deletions
diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs
index 5ede5bc6..11b9ef0e 100644
--- a/helix-view/src/document.rs
+++ b/helix-view/src/document.rs
@@ -5,6 +5,7 @@ use futures_util::future::BoxFuture;
use futures_util::FutureExt;
use helix_core::auto_pairs::AutoPairs;
use helix_core::doc_formatter::TextFormat;
+use helix_core::encoding::Encoding;
use helix_core::syntax::Highlight;
use helix_core::text_annotations::{InlineAnnotation, TextAnnotations};
use helix_core::Range;
@@ -130,6 +131,7 @@ pub struct Document {
path: Option<PathBuf>,
encoding: &'static encoding::Encoding,
+ has_bom: bool,
pub restore_cursor: bool,
@@ -277,16 +279,104 @@ impl fmt::Debug for DocumentInlayHintsId {
}
}
+/// Encoder used when writing a document out in its target encoding.
+///
+/// UTF-16 is encoded by hand because `encoding_rs` does not encode to UTF-16
+/// (`Encoding::new_encoder()` for a UTF-16 encoding yields a UTF-8 encoder).
+/// Every other encoding is delegated to the `encoding_rs` encoder.
+enum Encoder {
+    Utf16Be,
+    Utf16Le,
+    EncodingRs(encoding::Encoder),
+}
+
+impl Encoder {
+    /// Picks the hand-written UTF-16 encoder for UTF-16BE/LE and wraps
+    /// `encoding.new_encoder()` for everything else.
+    fn from_encoding(encoding: &'static encoding::Encoding) -> Self {
+        if encoding == encoding::UTF_16BE {
+            Self::Utf16Be
+        } else if encoding == encoding::UTF_16LE {
+            Self::Utf16Le
+        } else {
+            Self::EncodingRs(encoding.new_encoder())
+        }
+    }
+
+    /// Encodes as much of `src` as fits into `dst`.
+    ///
+    /// `is_empty` is forwarded to `encoding_rs` as its `last` flag, i.e. it
+    /// marks the final (empty) chunk of the stream.
+    ///
+    /// Returns `(result, bytes_read_from_src, bytes_written_to_dst)`, where
+    /// `result` is `OutputFull` if `dst` ran out of room before all of `src`
+    /// was consumed and `InputEmpty` otherwise.
+    fn encode_from_utf8(
+        &mut self,
+        src: &str,
+        dst: &mut [u8],
+        is_empty: bool,
+    ) -> (encoding::CoderResult, usize, usize) {
+        match self {
+            Self::Utf16Be => Self::encode_utf16(src, dst, u16::to_be_bytes),
+            Self::Utf16Le => Self::encode_utf16(src, dst, u16::to_le_bytes),
+            Self::EncodingRs(encoder) => {
+                // Deliberately no early return on an empty `src`: the final
+                // call with `is_empty == true` is what lets stateful encoders
+                // (e.g. ISO-2022-JP) flush their end-of-stream bytes.
+                let (code_result, read, written, ..) = encoder.encode_from_utf8(src, dst, is_empty);
+
+                (code_result, read, written)
+            }
+        }
+    }
+
+    /// Encodes `src` as UTF-16 into `dst`, using `convert` to serialize each
+    /// code unit (big- or little-endian). Characters are never split: if the
+    /// next character does not fit entirely, encoding stops before it.
+    fn encode_utf16(
+        src: &str,
+        dst: &mut [u8],
+        convert: fn(u16) -> [u8; 2],
+    ) -> (encoding::CoderResult, usize, usize) {
+        let mut total_written = 0usize;
+
+        for (index, character) in src.char_indices() {
+            let mut units = [0u16; 2];
+            let units = character.encode_utf16(&mut units);
+            let character_size = units.len() * 2;
+
+            // `total_written` never exceeds `dst.len()`, so this cannot underflow.
+            if dst.len() - total_written < character_size {
+                return (encoding::CoderResult::OutputFull, index, total_written);
+            }
+
+            for unit in units.iter() {
+                dst[total_written..total_written + 2].copy_from_slice(&convert(*unit));
+                total_written += 2;
+            }
+        }
+
+        (encoding::CoderResult::InputEmpty, src.len(), total_written)
+    }
+}
+
+/// Writes the byte order mark for `encoding` at the start of `buf` when the
+/// encoding is UTF-8, UTF-16BE or UTF-16LE.
+///
+/// Returns the number of bytes written, which is 0 for any other encoding.
+fn apply_bom(encoding: &'static encoding::Encoding, buf: &mut [u8; BUF_SIZE]) -> usize {
+    let bom: &[u8] = if encoding == encoding::UTF_8 {
+        &[0xef, 0xbb, 0xbf]
+    } else if encoding == encoding::UTF_16BE {
+        &[0xfe, 0xff]
+    } else if encoding == encoding::UTF_16LE {
+        &[0xff, 0xfe]
+    } else {
+        &[]
+    };
+
+    buf[..bom.len()].copy_from_slice(bom);
+    bom.len()
+}
+
// The documentation and implementation of this function should be up-to-date with
// its sibling function, `to_writer()`.
//
/// Decodes a stream of bytes into UTF-8, returning a `Rope` and the
-/// encoding it was decoded as. The optional `encoding` parameter can
-/// be used to override encoding auto-detection.
+/// encoding it was decoded as, along with whether a BOM was present. The
+/// optional `encoding` parameter can be used to override encoding auto-detection.
pub fn from_reader<R: std::io::Read + ?Sized>(
reader: &mut R,
- encoding: Option<&'static encoding::Encoding>,
-) -> Result<(Rope, &'static encoding::Encoding), Error> {
+ encoding: Option<&'static Encoding>,
+) -> Result<(Rope, &'static Encoding, bool), Error> {
// These two buffers are 8192 bytes in size each and are used as
// intermediaries during the decoding process. Text read into `buf`
// from `reader` is decoded into `buf_out` as UTF-8. Once either
@@ -296,25 +386,32 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
let mut buf_out = [0u8; BUF_SIZE];
let mut builder = RopeBuilder::new();
- // By default, the encoding of the text is auto-detected via the
- // `chardetng` crate which requires sample data from the reader.
+ // By default, the encoding of the text is auto-detected from its BOM
+ // via `encoding_rs`'s `Encoding::for_bom` and, if no BOM is found, by
+ // the `chardetng` crate, which requires sample data from the reader.
// As a manual override to this auto-detection is possible, the
// same data is read into `buf` to ensure symmetry in the upcoming
// loop.
- let (encoding, mut decoder, mut slice, mut is_empty) = {
+ let (encoding, has_bom, mut decoder, mut slice, mut is_empty) = {
let read = reader.read(&mut buf)?;
let is_empty = read == 0;
- let encoding = encoding.unwrap_or_else(|| {
- let mut encoding_detector = chardetng::EncodingDetector::new();
- encoding_detector.feed(&buf, is_empty);
- encoding_detector.guess(None, true)
- });
+ let (encoding, has_bom) = encoding
+ .map(|encoding| (encoding, false))
+ .or_else(|| {
+ encoding::Encoding::for_bom(&buf).map(|(encoding, _bom_size)| (encoding, true))
+ })
+ .unwrap_or_else(|| {
+ let mut encoding_detector = chardetng::EncodingDetector::new();
+ encoding_detector.feed(&buf, is_empty);
+ (encoding_detector.guess(None, true), false)
+ });
+
let decoder = encoding.new_decoder();
// If the amount of bytes read from the reader is less than
// `buf.len()`, it is undesirable to read the bytes afterwards.
let slice = &buf[..read];
- (encoding, decoder, slice, is_empty)
+ (encoding, has_bom, decoder, slice, is_empty)
};
// `RopeBuilder::append()` expects a `&str`, so this is the "real"
@@ -382,7 +479,7 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
is_empty = read == 0;
}
let rope = builder.finish();
- Ok((rope, encoding))
+ Ok((rope, encoding, has_bom))
}
// The documentation and implementation of this function should be up-to-date with
@@ -393,7 +490,7 @@ pub fn from_reader<R: std::io::Read + ?Sized>(
/// replacement characters may appear in the encoded text.
pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
writer: &'a mut W,
- encoding: &'static encoding::Encoding,
+ encoding_with_bom_info: (&'static Encoding, bool),
rope: &'a Rope,
) -> Result<(), Error> {
// Text inside a `Rope` is stored as non-contiguous blocks of data called
@@ -402,13 +499,22 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
// determined by filtering the iterator to remove all empty chunks and then
// appending an empty chunk to it. This is valuable for detecting when all
// chunks in the `Rope` have been iterated over in the subsequent loop.
+ let (encoding, has_bom) = encoding_with_bom_info;
+
let iter = rope
.chunks()
.filter(|c| !c.is_empty())
.chain(std::iter::once(""));
let mut buf = [0u8; BUF_SIZE];
- let mut encoder = encoding.new_encoder();
- let mut total_written = 0usize;
+
+ let mut total_written = if has_bom {
+ apply_bom(encoding, &mut buf)
+ } else {
+ 0
+ };
+
+ let mut encoder = Encoder::from_encoding(encoding);
+
for chunk in iter {
let is_empty = chunk.is_empty();
let mut total_read = 0usize;
@@ -449,6 +555,7 @@ pub async fn to_writer<'a, W: tokio::io::AsyncWriteExt + Unpin + ?Sized>(
break;
}
}
+
Ok(())
}
@@ -466,10 +573,10 @@ use url::Url;
impl Document {
pub fn from(
text: Rope,
- encoding: Option<&'static encoding::Encoding>,
+ encoding_with_bom_info: Option<(&'static Encoding, bool)>,
config: Arc<dyn DynAccess<Config>>,
) -> Self {
- let encoding = encoding.unwrap_or(encoding::UTF_8);
+ let (encoding, has_bom) = encoding_with_bom_info.unwrap_or((encoding::UTF_8, false));
let changes = ChangeSet::new(&text);
let old_state = None;
@@ -477,6 +584,7 @@ impl Document {
id: DocumentId::default(),
path: None,
encoding,
+ has_bom,
text,
selections: HashMap::default(),
inlay_hints: HashMap::default(),
@@ -511,21 +619,21 @@ impl Document {
/// overwritten with the `encoding` parameter.
pub fn open(
path: &Path,
- encoding: Option<&'static encoding::Encoding>,
+ encoding: Option<&'static Encoding>,
config_loader: Option<Arc<syntax::Loader>>,
config: Arc<dyn DynAccess<Config>>,
) -> Result<Self, Error> {
// Open the file if it exists, otherwise assume it is a new file (and thus empty).
- let (rope, encoding) = if path.exists() {
+ let (rope, encoding, has_bom) = if path.exists() {
let mut file =
std::fs::File::open(path).context(format!("unable to open {:?}", path))?;
from_reader(&mut file, encoding)?
} else {
let encoding = encoding.unwrap_or(encoding::UTF_8);
- (Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding)
+ (Rope::from(DEFAULT_LINE_ENDING.as_str()), encoding, false)
};
- let mut doc = Self::from(rope, Some(encoding), config);
+ let mut doc = Self::from(rope, Some((encoding, has_bom)), config);
// set the path and try detecting the language
doc.set_path(Some(path))?;
@@ -576,7 +684,7 @@ impl Document {
})?;
{
let mut stdin = process.stdin.take().ok_or(FormatterError::BrokenStdin)?;
- to_writer(&mut stdin, encoding::UTF_8, &text)
+ to_writer(&mut stdin, (encoding::UTF_8, false), &text)
.await
.map_err(|_| FormatterError::BrokenStdin)?;
}
@@ -688,8 +796,7 @@ impl Document {
let current_rev = self.get_current_revision();
let doc_id = self.id();
- let encoding = self.encoding;
-
+ let encoding_with_bom_info = (self.encoding, self.has_bom);
let last_saved_time = self.last_saved_time;
// We encode the file according to the `Document`'s encoding.
@@ -718,7 +825,7 @@ impl Document {
}
let mut file = File::create(&path).await?;
- to_writer(&mut file, encoding, &text).await?;
+ to_writer(&mut file, encoding_with_bom_info, &text).await?;
let event = DocumentSavedEvent {
revision: current_rev,
@@ -776,7 +883,7 @@ impl Document {
provider_registry: &DiffProviderRegistry,
redraw_handle: RedrawHandle,
) -> Result<(), Error> {
- let encoding = &self.encoding;
+ let encoding = self.encoding;
let path = self
.path()
.filter(|path| path.exists())
@@ -810,13 +917,16 @@ impl Document {
/// Sets the [`Document`]'s encoding with the encoding correspondent to `label`.
pub fn set_encoding(&mut self, label: &str) -> Result<(), Error> {
- self.encoding = encoding::Encoding::for_label(label.as_bytes())
- .ok_or_else(|| anyhow!("unknown encoding"))?;
+ let encoding =
+ Encoding::for_label(label.as_bytes()).ok_or_else(|| anyhow!("unknown encoding"))?;
+
+ self.encoding = encoding;
+
Ok(())
}
/// Returns the [`Document`]'s current encoding.
- pub fn encoding(&self) -> &'static encoding::Encoding {
+ pub fn encoding(&self) -> &'static Encoding {
self.encoding
}
@@ -1280,7 +1390,7 @@ impl Document {
/// Initializes/updates the differ for this document with a new base.
pub fn set_diff_base(&mut self, diff_base: Vec<u8>, redraw_handle: RedrawHandle) {
- if let Ok((diff_base, _)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
+ if let Ok((diff_base, ..)) = from_reader(&mut diff_base.as_slice(), Some(self.encoding)) {
if let Some(differ) = &self.diff_handle {
differ.update_diff_base(diff_base);
return;
@@ -1724,7 +1834,7 @@ mod test {
assert!(ref_path.exists());
let mut file = std::fs::File::open(path).unwrap();
- let text = from_reader(&mut file, Some(encoding))
+ let text = from_reader(&mut file, Some(encoding.into()))
.unwrap()
.0
.to_string();
@@ -1750,7 +1860,7 @@ mod test {
let text = Rope::from_str(&std::fs::read_to_string(path).unwrap());
let mut buf: Vec<u8> = Vec::new();
- helix_lsp::block_on(to_writer(&mut buf, encoding, &text)).unwrap();
+ helix_lsp::block_on(to_writer(&mut buf, (encoding, false), &text)).unwrap();
let expectation = std::fs::read(ref_path).unwrap();
assert_eq!(buf, expectation);
diff --git a/helix-view/src/editor.rs b/helix-view/src/editor.rs
index 005c6667..8e4dab41 100644
--- a/helix-view/src/editor.rs
+++ b/helix-view/src/editor.rs
@@ -1305,10 +1305,10 @@ impl Editor {
}
pub fn new_file_from_stdin(&mut self, action: Action) -> Result<DocumentId, Error> {
- let (rope, encoding) = crate::document::from_reader(&mut stdin(), None)?;
+ let (rope, encoding, has_bom) = crate::document::from_reader(&mut stdin(), None)?;
Ok(self.new_file_from_document(
action,
- Document::from(rope, Some(encoding), self.config.clone()),
+ Document::from(rope, Some((encoding, has_bom)), self.config.clone()),
))
}