aboutsummaryrefslogtreecommitdiff
path: root/helix-core/src/diff.rs
diff options
context:
space:
mode:
Diffstat (limited to 'helix-core/src/diff.rs')
-rw-r--r--helix-core/src/diff.rs248
1 files changed, 203 insertions, 45 deletions
diff --git a/helix-core/src/diff.rs b/helix-core/src/diff.rs
index 6960c679..c754da30 100644
--- a/helix-core/src/diff.rs
+++ b/helix-core/src/diff.rs
@@ -1,58 +1,195 @@
-use crate::{Rope, Transaction};
+use std::ops::Range;
+use std::time::Instant;
-/// Compares `old` and `new` to generate a [`Transaction`] describing
-/// the steps required to get from `old` to `new`.
-pub fn compare_ropes(old: &Rope, new: &Rope) -> Transaction {
- // `similar` only works on contiguous data, so a `Rope` has
- // to be temporarily converted into a `String`.
- let old_converted = old.to_string();
- let new_converted = new.to_string();
-
- // A timeout is set so after 1 seconds, the algorithm will start
- // approximating. This is especially important for big `Rope`s or
- // `Rope`s that are extremely dissimilar to each other.
- let mut config = similar::TextDiff::configure();
- config.timeout(std::time::Duration::from_secs(1));
-
- let diff = config.diff_chars(&old_converted, &new_converted);
-
- // The current position of the change needs to be tracked to
- // construct the `Change`s.
- let mut pos = 0;
- Transaction::change(
- old,
- diff.ops()
+use imara_diff::intern::InternedInput;
+use imara_diff::Algorithm;
+use ropey::RopeSlice;
+
+use crate::{ChangeSet, Rope, Tendril, Transaction};
+
+/// A `imara_diff::Sink` that builds a `ChangeSet` for a character diff of a hunk
+struct CharChangeSetBuilder<'a> {
+ res: &'a mut ChangeSet,
+ hunk: &'a InternedInput<char>,
+ pos: u32,
+}
+
+impl imara_diff::Sink for CharChangeSetBuilder<'_> {
+ type Out = ();
+ fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
+ self.res.retain((before.start - self.pos) as usize);
+ self.res.delete(before.len());
+ self.pos = before.end;
+
+ let res = self.hunk.after[after.start as usize..after.end as usize]
+ .iter()
+ .map(|&token| self.hunk.interner[token])
+ .collect();
+
+ self.res.insert(res);
+ }
+
+ fn finish(self) -> Self::Out {
+ self.res.retain(self.hunk.before.len() - self.pos as usize);
+ }
+}
+
+struct LineChangeSetBuilder<'a> {
+ res: ChangeSet,
+ after: RopeSlice<'a>,
+ file: &'a InternedInput<RopeSlice<'a>>,
+ current_hunk: InternedInput<char>,
+ pos: u32,
+}
+
+impl imara_diff::Sink for LineChangeSetBuilder<'_> {
+ type Out = ChangeSet;
+
+ fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
+ let len = self.file.before[self.pos as usize..before.start as usize]
.iter()
- .map(|op| op.as_tag_tuple())
- .filter_map(|(tag, old_range, new_range)| {
- // `old_pos..pos` is equivalent to `start..end` for where
- // the change should be applied.
- let old_pos = pos;
- pos += old_range.end - old_range.start;
-
- match tag {
- // Semantically, inserts and replacements are the same thing.
- similar::DiffTag::Insert | similar::DiffTag::Replace => {
- // This is the text from the `new` rope that should be
- // inserted into `old`.
- let text: &str = {
- let start = new.char_to_byte(new_range.start);
- let end = new.char_to_byte(new_range.end);
- &new_converted[start..end]
- };
- Some((old_pos, pos, Some(text.into())))
+ .map(|&it| self.file.interner[it].len_chars())
+ .sum();
+ self.res.retain(len);
+ self.pos = before.end;
+
+ // do not perform diffs on large hunks
+ let len_before = before.end - before.start;
+ let len_after = after.end - after.start;
+
+ // Pure insertions/removals do not require a character diff.
+ // Very large changes are ignored because their character diff is expensive to compute
+ // TODO adjust heuristic to detect large changes?
+ if len_before == 0
+ || len_after == 0
+ || len_after > 5 * len_before
+ || 5 * len_after < len_before && len_before > 10
+ || len_before + len_after > 200
+ {
+ let remove = self.file.before[before.start as usize..before.end as usize]
+ .iter()
+ .map(|&it| self.file.interner[it].len_chars())
+ .sum();
+ self.res.delete(remove);
+ let mut fragment = Tendril::new();
+ if len_after > 500 {
+ // copying a rope line by line is slower then copying the entire
+ // rope. Use to_string for very large changes instead..
+ if self.file.after.len() == after.end as usize {
+ if after.start == 0 {
+ fragment = self.after.to_string().into();
+ } else {
+ let start = self.after.line_to_char(after.start as usize);
+ fragment = self.after.slice(start..).to_string().into();
}
- similar::DiffTag::Delete => Some((old_pos, pos, None)),
- similar::DiffTag::Equal => None,
+ } else if after.start == 0 {
+ let end = self.after.line_to_char(after.end as usize);
+ fragment = self.after.slice(..end).to_string().into();
+ } else {
+ let start = self.after.line_to_char(after.start as usize);
+ let end = self.after.line_to_char(after.end as usize);
+ fragment = self.after.slice(start..end).to_string().into();
}
- }),
- )
+ } else {
+ for &line in &self.file.after[after.start as usize..after.end as usize] {
+ for chunk in self.file.interner[line].chunks() {
+ fragment.push_str(chunk)
+ }
+ }
+ };
+ self.res.insert(fragment);
+ } else {
+ // for reasonably small hunks, generating a ChangeSet from char diff can save memory
+ // TODO use a tokenizer (word diff?) for improved performance
+ let hunk_before = self.file.before[before.start as usize..before.end as usize]
+ .iter()
+ .flat_map(|&it| self.file.interner[it].chars());
+ let hunk_after = self.file.after[after.start as usize..after.end as usize]
+ .iter()
+ .flat_map(|&it| self.file.interner[it].chars());
+ self.current_hunk.update_before(hunk_before);
+ self.current_hunk.update_after(hunk_after);
+
+ // the histogram heuristic does not work as well
+ // for characters because the same characters often reoccur
+ // use myer diff instead
+ imara_diff::diff(
+ Algorithm::Myers,
+ &self.current_hunk,
+ CharChangeSetBuilder {
+ res: &mut self.res,
+ hunk: &self.current_hunk,
+ pos: 0,
+ },
+ );
+
+ self.current_hunk.clear();
+ }
+ }
+
+ fn finish(mut self) -> Self::Out {
+ let len = self.file.before[self.pos as usize..]
+ .iter()
+ .map(|&it| self.file.interner[it].len_chars())
+ .sum();
+
+ self.res.retain(len);
+ self.res
+ }
+}
+
+struct RopeLines<'a>(RopeSlice<'a>);
+
+impl<'a> imara_diff::intern::TokenSource for RopeLines<'a> {
+ type Token = RopeSlice<'a>;
+ // TODO: improve performance of lines iterator (https://github.com/cessen/ropey/issues/25)
+ type Tokenizer = ropey::iter::Lines<'a>;
+
+ fn tokenize(&self) -> Self::Tokenizer {
+ self.0.lines()
+ }
+
+ fn estimate_tokens(&self) -> u32 {
+ // we can provide a perfect estimate which is very nice for performance
+ self.0.len_lines() as u32
+ }
+}
+
+/// Compares `old` and `new` to generate a [`Transaction`] describing
+/// the steps required to get from `old` to `new`.
+pub fn compare_ropes(before: &Rope, after: &Rope) -> Transaction {
+ let start = Instant::now();
+ let res = ChangeSet::with_capacity(32);
+ let after = after.slice(..);
+ let file = InternedInput::new(RopeLines(before.slice(..)), RopeLines(after));
+ let builder = LineChangeSetBuilder {
+ res,
+ file: &file,
+ after,
+ pos: 0,
+ current_hunk: InternedInput::default(),
+ };
+
+ let res = imara_diff::diff(Algorithm::Histogram, &file, builder).into();
+
+ log::debug!(
+ "rope diff took {}s",
+ Instant::now().duration_since(start).as_secs_f64()
+ );
+ res
}
#[cfg(test)]
mod tests {
use super::*;
+ fn test_identity(a: &str, b: &str) {
+ let mut old = Rope::from(a);
+ let new = Rope::from(b);
+ compare_ropes(&old, &new).apply(&mut old);
+ assert_eq!(old, new);
+ }
+
quickcheck::quickcheck! {
fn test_compare_ropes(a: String, b: String) -> bool {
let mut old = Rope::from(a);
@@ -61,4 +198,25 @@ mod tests {
old == new
}
}
+
+ #[test]
+ fn equal_files() {
+ test_identity("foo", "foo");
+ }
+
+ #[test]
+ fn trailing_newline() {
+ test_identity("foo\n", "foo");
+ test_identity("foo", "foo\n");
+ }
+
+ #[test]
+ fn new_file() {
+ test_identity("", "foo");
+ }
+
+ #[test]
+ fn deleted_file() {
+ test_identity("foo", "");
+ }
}