From f88d4c1e20d4dbc244599ad3f3a5f301bec239bf Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Thu, 29 Jul 2021 12:10:59 -0700 Subject: Move indent-style code into `helix_core::indent`. --- helix-core/src/indent.rs | 169 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 1 deletion(-) (limited to 'helix-core/src') diff --git a/helix-core/src/indent.rs b/helix-core/src/indent.rs index 5ae66769..d272dd68 100644 --- a/helix-core/src/indent.rs +++ b/helix-core/src/indent.rs @@ -1,10 +1,177 @@ use crate::{ + chars::{char_is_line_ending, char_is_whitespace}, find_first_non_whitespace_char, syntax::{IndentQuery, LanguageConfiguration, Syntax}, tree_sitter::Node, - RopeSlice, + Rope, RopeSlice, }; +/// Enum representing indentation style. +/// +/// Only values 1-8 are valid for the `Spaces` variant. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum IndentStyle { + Tabs, + Spaces(u8), +} + +impl IndentStyle { + /// Creates an `IndentStyle` from an indentation string. + /// + /// For example, passing `" "` (four spaces) will create `IndentStyle::Spaces(4)`. + #[allow(clippy::should_implement_trait)] + #[inline] + pub fn from_str(indent: &str) -> Self { + // XXX: do we care about validating the input more than this? Probably not...? + debug_assert!(!indent.is_empty() && indent.len() <= 8); + + if indent.starts_with(' ') { + IndentStyle::Spaces(indent.len() as u8) + } else { + IndentStyle::Tabs + } + } + + #[inline] + pub fn as_str(&self) -> &'static str { + match *self { + IndentStyle::Tabs => "\t", + IndentStyle::Spaces(1) => " ", + IndentStyle::Spaces(2) => " ", + IndentStyle::Spaces(3) => " ", + IndentStyle::Spaces(4) => " ", + IndentStyle::Spaces(5) => " ", + IndentStyle::Spaces(6) => " ", + IndentStyle::Spaces(7) => " ", + IndentStyle::Spaces(8) => " ", + + // Unsupported indentation style. This should never happen, + // but just in case fall back to two spaces. + IndentStyle::Spaces(n) => { + debug_assert!(n > 0 && n <= 8); // Always triggers. `debug_panic!()` wanted. + " " + } + } + } +} + +/// Attempts to detect the indentation style used in a document. +/// +/// Returns the indentation style if the auto-detect confidence is +/// reasonably high, otherwise returns `None`. +pub fn auto_detect_indent_style(document_text: &Rope) -> Option { + // Build a histogram of the indentation *increases* between + // subsequent lines, ignoring lines that are all whitespace. + // + // Index 0 is for tabs, the rest are 1-8 spaces. + let histogram: [usize; 9] = { + let mut histogram = [0; 9]; + let mut prev_line_is_tabs = false; + let mut prev_line_leading_count = 0usize; + + // Loop through the lines, checking for and recording indentation + // increases as we go. + 'outer: for line in document_text.lines().take(1000) { + let mut c_iter = line.chars(); + + // Is first character a tab or space? + let is_tabs = match c_iter.next() { + Some('\t') => true, + Some(' ') => false, + + // Ignore blank lines. + Some(c) if char_is_line_ending(c) => continue, + + _ => { + prev_line_is_tabs = false; + prev_line_leading_count = 0; + continue; + } + }; + + // Count the line's total leading tab/space characters. + let mut leading_count = 1; + let mut count_is_done = false; + for c in c_iter { + match c { + '\t' if is_tabs && !count_is_done => leading_count += 1, + ' ' if !is_tabs && !count_is_done => leading_count += 1, + + // We stop counting if we hit whitespace that doesn't + // qualify as indent or doesn't match the leading + // whitespace, but we don't exit the loop yet because + // we still want to determine if the line is blank. + c if char_is_whitespace(c) => count_is_done = true, + + // Ignore blank lines. + c if char_is_line_ending(c) => continue 'outer, + + _ => break, + } + + // Bound the worst-case execution time for weird text files. + if leading_count > 256 { + continue 'outer; + } + } + + // If there was an increase in indentation over the previous + // line, update the histogram with that increase. + if (prev_line_is_tabs == is_tabs || prev_line_leading_count == 0) + && prev_line_leading_count < leading_count + { + if is_tabs { + histogram[0] += 1; + } else { + let amount = leading_count - prev_line_leading_count; + if amount <= 8 { + histogram[amount] += 1; + } + } + } + + // Store this line's leading whitespace info for use with + // the next line. + prev_line_is_tabs = is_tabs; + prev_line_leading_count = leading_count; + } + + // Give more weight to tabs, because their presence is a very + // strong indicator. + histogram[0] *= 2; + + histogram + }; + + // Find the most frequent indent, its frequency, and the frequency of + // the next-most frequent indent. + let indent = histogram + .iter() + .enumerate() + .max_by_key(|kv| kv.1) + .unwrap() + .0; + let indent_freq = histogram[indent]; + let indent_freq_2 = *histogram + .iter() + .enumerate() + .filter(|kv| kv.0 != indent) + .map(|kv| kv.1) + .max() + .unwrap(); + + // Return the the auto-detected result if we're confident enough in its + // accuracy, based on some heuristics. + if indent_freq >= 1 && (indent_freq_2 as f64 / indent_freq as f64) < 0.66 { + Some(match indent { + 0 => IndentStyle::Tabs, + _ => IndentStyle::Spaces(indent as u8), + }) + } else { + None + } +} + /// To determine indentation of a newly inserted line, figure out the indentation at the last col /// of the previous line. #[allow(dead_code)] -- cgit v1.2.3-70-g09d2