Capture word parts while calculating shellwords (#4632)

This fixes an edge case for completing shellwords. With a file "a b.txt" in the current directory, the sequence `:open a\<tab>` will result in the prompt containing `:open aa\ b.txt`. This happens because the length of the input to trim when replacing it with the completion is calculated from the part of the input that has been parsed by shellwords and then escaped again (in a separate operation), and that round trip is lossy. In this case it loses the trailing backslash. The fix provided here refactors shellwords to track both the _words_ (shellwords with quotes and escapes resolved) and the _parts_ (chunks of the input which turned into each word, with separating whitespace removed). When calculating how much of the input to delete when replacing with the completion item, we now use the length of the last part. This also allows us to eliminate the duplicate work done in the `ends_with_whitespace` check.
author: Michael Davis 2022-11-17 01:00:48 +0000
committer: GitHub 2022-11-17 01:00:48 +0000
commit: c6b83368b3b626cb63120b5ac684cc8b1b693172 (patch)
tree: f14721ce897e1e525e63c384b2fa46fbef141ae4 /helix-core/src
parent: b474ee1843d5cb7cb5291bee4166490a223e5aac (diff)
1 files changed, 171 insertions, 160 deletions
diff --git a/helix-core/src/shellwords.rs b/helix-core/src/shellwords.rs
index 7742896c..9475f5e5 100644
--- a/helix-core/src/shellwords.rs
+++ b/helix-core/src/shellwords.rs
@@ -27,181 +27,172 @@ enum State {
     DquoteEscaped,
 }
 
-/// Get the vec of escaped / quoted / doublequoted filenames from the input str
-pub fn shellwords(input: &str) -> Vec<Cow<'_, str>> {
-    use State::*;
+pub struct Shellwords<'a> {
+    state: State,
+    /// Shellwords where whitespace and escapes have been resolved.
+    words: Vec<Cow<'a, str>>,
+    /// The parts of the input that are divided into shellwords. This can be
+    /// used to retrieve the original text for a given word by looking up the
+    /// same index in the Vec as the word in `words`.
+    parts: Vec<&'a str>,
+}
 
-    let mut state = Unquoted;
-    let mut args: Vec<Cow<str>> = Vec::new();
-    let mut escaped = String::with_capacity(input.len());
+impl<'a> From<&'a str> for Shellwords<'a> {
+    fn from(input: &'a str) -> Self {
+        use State::*;
 
-    let mut start = 0;
-    let mut end = 0;
+        let mut state = Unquoted;
+        let mut words = Vec::new();
+        let mut parts = Vec::new();
+        let mut escaped = String::with_capacity(input.len());
 
-    for (i, c) in input.char_indices() {
-        state = match state {
-            OnWhitespace => match c {
-                '"' => {
-                    end = i;
-                    Dquoted
-                }
-                '\'' => {
-                    end = i;
-                    Quoted
-                }
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        UnquotedEscaped
-                    } else {
+        let mut part_start = 0;
+        let mut unescaped_start = 0;
+        let mut end = 0;
+
+        for (i, c) in input.char_indices() {
+            state = match state {
+                OnWhitespace => match c {
+                    '"' => {
+                        end = i;
+                        Dquoted
+                    }
+                    '\'' => {
+                        end = i;
+                        Quoted
+                    }
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            UnquotedEscaped
+                        } else {
+                            OnWhitespace
+                        }
+                    }
+                    c if c.is_ascii_whitespace() => {
+                        end = i;
                         OnWhitespace
                     }
-                }
-                c if c.is_ascii_whitespace() => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Unquoted,
-            },
-            Unquoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        UnquotedEscaped
-                    } else {
-                        Unquoted
+                    _ => Unquoted,
+                },
+                Unquoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            UnquotedEscaped
+                        } else {
+                            Unquoted
+                        }
                     }
-                }
-                c if c.is_ascii_whitespace() => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Unquoted,
-            },
-            UnquotedEscaped => Unquoted,
-            Quoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        QuoteEscaped
-                    } else {
-                        Quoted
+                    c if c.is_ascii_whitespace() => {
+                        end = i;
+                        OnWhitespace
                     }
-                }
-                '\'' => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Quoted,
-            },
-            QuoteEscaped => Quoted,
-            Dquoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        DquoteEscaped
-                    } else {
-                        Dquoted
+                    _ => Unquoted,
+                },
+                UnquotedEscaped => Unquoted,
+                Quoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            QuoteEscaped
+                        } else {
+                            Quoted
+                        }
                     }
-                }
-                '"' => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Dquoted,
-            },
-            DquoteEscaped => Dquoted,
-        };
+                    '\'' => {
+                        end = i;
+                        OnWhitespace
+                    }
+                    _ => Quoted,
+                },
+                QuoteEscaped => Quoted,
+                Dquoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            DquoteEscaped
+                        } else {
+                            Dquoted
+                        }
+                    }
+                    '"' => {
+                        end = i;
+                        OnWhitespace
+                    }
+                    _ => Dquoted,
+                },
+                DquoteEscaped => Dquoted,
+            };
 
-        if i >= input.len() - 1 && end == 0 {
-            end = i + 1;
-        }
+            if i >= input.len() - 1 && end == 0 {
+                end = i + 1;
+            }
 
-        if end > 0 {
-            let esc_trim = escaped.trim();
-            let inp = &input[start..end];
+            if end > 0 {
+                let esc_trim = escaped.trim();
+                let inp = &input[unescaped_start..end];
 
-            if !(esc_trim.is_empty() && inp.trim().is_empty()) {
-                if esc_trim.is_empty() {
-                    args.push(inp.into());
-                } else {
-                    args.push([escaped, inp.into()].concat().into());
-                    escaped = "".to_string();
+                if !(esc_trim.is_empty() && inp.trim().is_empty()) {
+                    if esc_trim.is_empty() {
+                        words.push(inp.into());
+                        parts.push(inp);
+                    } else {
+                        words.push([escaped, inp.into()].concat().into());
+                        parts.push(&input[part_start..end]);
+                        escaped = "".to_string();
+                    }
                 }
+                unescaped_start = i + 1;
+                part_start = i + 1;
+                end = 0;
             }
-            start = i + 1;
-            end = 0;
         }
-    }
-    args
-}
 
-/// Checks that the input ends with an ascii whitespace character which is
-/// not escaped.
-///
-/// # Examples
-///
-/// ```rust
-/// use helix_core::shellwords::ends_with_whitespace;
-/// assert_eq!(ends_with_whitespace(" "), true);
-/// assert_eq!(ends_with_whitespace(":open "), true);
-/// assert_eq!(ends_with_whitespace(":open foo.txt "), true);
-/// assert_eq!(ends_with_whitespace(":open"), false);
-/// #[cfg(unix)]
-/// assert_eq!(ends_with_whitespace(":open a\\ "), false);
-/// #[cfg(unix)]
-/// assert_eq!(ends_with_whitespace(":open a\\ b.txt"), false);
-/// ```
-pub fn ends_with_whitespace(input: &str) -> bool {
-    use State::*;
+        debug_assert!(words.len() == parts.len());
 
-    // Fast-lane: the input must end with a whitespace character
-    // regardless of quoting.
-    if !input.ends_with(|c: char| c.is_ascii_whitespace()) {
-        return false;
+        Self {
+            state,
+            words,
+            parts,
+        }
     }
+}
 
-    let mut state = Unquoted;
+impl<'a> Shellwords<'a> {
+    /// Checks that the input ends with a whitespace character which is not escaped.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use helix_core::shellwords::Shellwords;
+    /// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false);
+    /// #[cfg(unix)]
+    /// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), false);
+    /// #[cfg(unix)]
+    /// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false);
+    /// ```
+    pub fn ends_with_whitespace(&self) -> bool {
+        matches!(self.state, State::OnWhitespace)
+    }
 
-    for c in input.chars() {
-        state = match state {
-            OnWhitespace => match c {
-                '"' => Dquoted,
-                '\'' => Quoted,
-                '\\' if cfg!(unix) => UnquotedEscaped,
-                '\\' => OnWhitespace,
-                c if c.is_ascii_whitespace() => OnWhitespace,
-                _ => Unquoted,
-            },
-            Unquoted => match c {
-                '\\' if cfg!(unix) => UnquotedEscaped,
-                '\\' => Unquoted,
-                c if c.is_ascii_whitespace() => OnWhitespace,
-                _ => Unquoted,
-            },
-            UnquotedEscaped => Unquoted,
-            Quoted => match c {
-                '\\' if cfg!(unix) => QuoteEscaped,
-                '\\' => Quoted,
-                '\'' => OnWhitespace,
-                _ => Quoted,
-            },
-            QuoteEscaped => Quoted,
-            Dquoted => match c {
-                '\\' if cfg!(unix) => DquoteEscaped,
-                '\\' => Dquoted,
-                '"' => OnWhitespace,
-                _ => Dquoted,
-            },
-            DquoteEscaped => Dquoted,
-        }
+    /// Returns the list of shellwords calculated from the input string.
+    pub fn words(&self) -> &[Cow<'a, str>] {
+        &self.words
     }
 
-    matches!(state, OnWhitespace)
+    /// Returns a list of strings which correspond to [`Self::words`] but represent the original
+    /// text in the input string - including escape characters - without separating whitespace.
+    pub fn parts(&self) -> &[&'a str] {
+        &self.parts
+    }
 }
 
 #[cfg(test)]
@@ -212,7 +203,8 @@ mod test {
     #[cfg(windows)]
     fn test_normal() {
         let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -230,7 +222,8 @@ mod test {
     #[cfg(unix)]
     fn test_normal() {
         let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -247,7 +240,8 @@ mod test {
     fn test_quoted() {
         let quoted =
             r#":o 'single_word' 'twó wörds' '' ' ''\three\' \"with\ escaping\\' 'quote incomplete"#;
-        let result = shellwords(quoted);
+        let shellwords = Shellwords::from(quoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -262,7 +256,8 @@ mod test {
     #[cfg(unix)]
     fn test_dquoted() {
         let dquoted = r#":o "single_word" "twó wörds" "" "  ""\three\' \"with\ escaping\\" "dquote incomplete"#;
-        let result = shellwords(dquoted);
+        let shellwords = Shellwords::from(dquoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -277,7 +272,8 @@ mod test {
     #[cfg(unix)]
     fn test_mixed() {
         let dquoted = r#":o single_word 'twó wörds' "\three\' \"with\ escaping\\""no space before"'and after' $#%^@ "%^&(%^" ')(*&^%''a\\\\\b' '"#;
-        let result = shellwords(dquoted);
+        let shellwords = Shellwords::from(dquoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -298,7 +294,8 @@ mod test {
     fn test_lists() {
         let input =
             r#":set statusline.center ["file-type","file-encoding"] '["list", "in", "qoutes"]'"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":set"),
             Cow::from("statusline.center"),
@@ -322,4 +319,18 @@ mod test {
         assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar"));
         assert_eq!(escape("foo bar".into()), Cow::Borrowed("\"foo bar\""));
     }
+
+    #[test]
+    #[cfg(unix)]
+    fn test_parts() {
+        assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]);
+        assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\ "]);
+    }
+
+    #[test]
+    #[cfg(windows)]
+    fn test_parts() {
+        assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]);
+        assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\"]);
+    }
 }
author	Michael Davis	2022-11-17 01:00:48 +0000
committer	GitHub	2022-11-17 01:00:48 +0000
commit	c6b83368b3b626cb63120b5ac684cc8b1b693172 (patch)
tree	f14721ce897e1e525e63c384b2fa46fbef141ae4 /helix-core/src
parent	b474ee1843d5cb7cb5291bee4166490a223e5aac (diff)