2 files changed, 95 insertions, 19 deletions
diff --git a/helix-lsp/src/client.rs b/helix-lsp/src/client.rs
index 6827f568..365c6990 100644
--- a/helix-lsp/src/client.rs
+++ b/helix-lsp/src/client.rs
@@ -104,7 +104,7 @@ impl Client {
             server_tx,
             request_counter: AtomicU64::new(0),
             capabilities: OnceCell::new(),
-            offset_encoding: OffsetEncoding::Utf8,
+            offset_encoding: OffsetEncoding::Utf16,
             config,
             req_timeout,
 
diff --git a/helix-lsp/src/lib.rs b/helix-lsp/src/lib.rs
index 8418896c..bcaff10a 100644
--- a/helix-lsp/src/lib.rs
+++ b/helix-lsp/src/lib.rs
@@ -57,6 +57,7 @@ pub enum OffsetEncoding {
 
 pub mod util {
     use super::*;
+    use helix_core::line_ending::{line_end_byte_index, line_end_char_index};
     use helix_core::{diagnostic::NumberOrString, Range, Rope, Selection, Tendril, Transaction};
 
     /// Converts a diagnostic in the document to [`lsp::Diagnostic`].
@@ -117,7 +118,7 @@ pub mod util {
 
     /// Converts [`lsp::Position`] to a position in the document.
     ///
-    /// Returns `None` if position exceeds document length or an operation overflows.
+    /// Returns `None` if position.line is out of bounds or an overflow occurs
     pub fn lsp_pos_to_pos(
         doc: &Rope,
         pos: lsp::Position,
@@ -128,22 +129,58 @@ pub mod util {
             return None;
         }
 
-        match offset_encoding {
+        // We need to be careful here to fully comply ith the LSP spec.
+        // Two relevant quotes from the spec:
+        //
+        // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#position
+        // > If the character value is greater than the line length it defaults back
+        // >  to the line length.
+        //
+        // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#textDocuments
+        // > To ensure that both client and server split the string into the same
+        // > line representation the protocol specifies the following end-of-line sequences:
+        // > ‘\n’, ‘\r\n’ and ‘\r’. Positions are line end character agnostic.
+        // > So you can not specify a position that denotes \r|\n or \n| where | represents the character offset.
+        //
+        // This means that while the line must be in bounds the `charater`
+        // must be capped to the end of the line.
+        // Note that the end of the line here is **before** the line terminator
+        // so we must use `line_end_char_index` istead of `doc.line_to_char(pos_line + 1)`
+        //
+        // FIXME: Helix does not fully comply with the LSP spec for line terminators.
+        // The LSP standard requires that line terminators are ['\n', '\r\n', '\r'].
+        // Without the unicode-linebreak feature disabled, the `\r` terminator is not handled by helix.
+        // With the unicode-linebreak feature, helix recognizes multiple extra line break chars
+        // which means that positions will be decoded/encoded incorrectly in their presence
+
+        let line = match offset_encoding {
             OffsetEncoding::Utf8 => {
-                let line = doc.line_to_char(pos_line);
-                let pos = line.checked_add(pos.character as usize)?;
-                if pos <= doc.len_chars() {
-                    Some(pos)
-                } else {
-                    None
-                }
+                let line_start = doc.line_to_byte(pos_line);
+                let line_end = line_end_byte_index(&doc.slice(..), pos_line);
+                line_start..line_end
             }
             OffsetEncoding::Utf16 => {
-                let line = doc.line_to_char(pos_line);
-                let line_start = doc.char_to_utf16_cu(line);
-                let pos = line_start.checked_add(pos.character as usize)?;
-                doc.try_utf16_cu_to_char(pos).ok()
+                // TODO directly translate line index to char-idx
+                // ropey can do this just as easily as utf-8 byte translation
+                // but the functions are just missing.
+                // Translate to char first and then utf-16 as a workaround
+                let line_start = doc.line_to_char(pos_line);
+                let line_end = line_end_char_index(&doc.slice(..), pos_line);
+                doc.char_to_utf16_cu(line_start)..doc.char_to_utf16_cu(line_end)
             }
+        };
+
+        // The LSP spec demands that the offset is capped to the end of the line
+        let pos = line
+            .start
+            .checked_add(pos.character as usize)
+            .unwrap_or(line.end)
+            .min(line.end);
+
+        // TODO prefer UTF32/char indices to avoid this step
+        match offset_encoding {
+            OffsetEncoding::Utf8 => doc.try_byte_to_char(pos).ok(),
+            OffsetEncoding::Utf16 => doc.try_utf16_cu_to_char(pos).ok(),
         }
     }
 
@@ -158,8 +195,8 @@ pub mod util {
         match offset_encoding {
             OffsetEncoding::Utf8 => {
                 let line = doc.char_to_line(pos);
-                let line_start = doc.line_to_char(line);
-                let col = pos - line_start;
+                let line_start = doc.line_to_byte(line);
+                let col = doc.char_to_byte(pos) - line_start;
 
                 lsp::Position::new(line as u32, col as u32)
             }
@@ -606,16 +643,55 @@ mod tests {
         }
 
         test_case!("", (0, 0) => Some(0));
-        test_case!("", (0, 1) => None);
+        test_case!("", (0, 1) => Some(0));
         test_case!("", (1, 0) => None);
         test_case!("\n\n", (0, 0) => Some(0));
         test_case!("\n\n", (1, 0) => Some(1));
-        test_case!("\n\n", (1, 1) => Some(2));
+        test_case!("\n\n", (1, 1) => Some(1));
         test_case!("\n\n", (2, 0) => Some(2));
         test_case!("\n\n", (3, 0) => None);
         test_case!("test\n\n\n\ncase", (4, 3) => Some(11));
         test_case!("test\n\n\n\ncase", (4, 4) => Some(12));
-        test_case!("test\n\n\n\ncase", (4, 5) => None);
+        test_case!("test\n\n\n\ncase", (4, 5) => Some(12));
         test_case!("", (u32::MAX, u32::MAX) => None);
     }
+
+    #[test]
+    fn emoji_format_gh_4791() {
+        use lsp_types::{Position, Range, TextEdit};
+
+        let edits = vec![
+            TextEdit {
+                range: Range {
+                    start: Position {
+                        line: 0,
+                        character: 1,
+                    },
+                    end: Position {
+                        line: 1,
+                        character: 0,
+                    },
+                },
+                new_text: "\n  ".to_string(),
+            },
+            TextEdit {
+                range: Range {
+                    start: Position {
+                        line: 1,
+                        character: 7,
+                    },
+                    end: Position {
+                        line: 2,
+                        character: 0,
+                    },
+                },
+                new_text: "\n  ".to_string(),
+            },
+        ];
+
+        let mut source = Rope::from_str("[\n\"🇺🇸\",\n\"🎄\",\n]");
+
+        let transaction = generate_transaction_from_edits(&source, edits, OffsetEncoding::Utf8);
+        assert!(transaction.apply(&mut source));
+    }
 }