From 9c6c63a2be30252a6207f4aebb5e0f76f746b4c8 Mon Sep 17 00:00:00 2001 From: Timothy DeHerrera Date: Fri, 28 Apr 2023 03:21:34 -0600 Subject: inject language based on file extension & shebang (#3970) * inject language based on file extension. Nodes can now be captured with "injection.filename". If this capture contains a valid file extension known to Helix, then the content will be highlighted as that language. * inject language by shebang. Nodes can now be captured with "injection.shebang". If this capture contains a valid shebang line known to Helix, then the content will be highlighted as the language the shebang calls for. * add documentation for language injection. * nix: fix highlights. The `@` is now highlighted properly on either side of the function arg. Also, extending the phases with `buildPhase = prev.buildPhase + ''''` is now highlighted properly. Fix highlighting of `''$` style escapes (requires tree-sitter-nix bump). Fix `inherit` highlighting. * simplify injection_for_match. Split out injection pair logic into its own method to make the overall flow easier to follow. Also transform the top-level function into a method on a HighlightConfiguration. 
* markdown: add shebang injection query--- helix-core/src/syntax.rs | 207 +++++++++++++++++++++++++++++------------------ 1 file changed, 130 insertions(+), 77 deletions(-) (limited to 'helix-core/src') diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index c34ea81a..6514b40f 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -662,9 +662,8 @@ impl Loader { pub fn language_config_for_shebang(&self, source: &Rope) -> Option> { let line = Cow::from(source.line(0)); - static SHEBANG_REGEX: Lazy = Lazy::new(|| { - Regex::new(r"^#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)").unwrap() - }); + static SHEBANG_REGEX: Lazy = + Lazy::new(|| Regex::new(&["^", SHEBANG].concat()).unwrap()); let configuration_id = SHEBANG_REGEX .captures(&line) .and_then(|cap| self.language_config_ids_by_shebang.get(&cap[1])); @@ -686,15 +685,14 @@ impl Loader { .cloned() } - pub fn language_configuration_for_injection_string( - &self, - string: &str, - ) -> Option> { + /// Unlike language_config_for_language_id, which only returns Some for an exact id, this + /// function will perform a regex match on the given string to find the closest language match. 
+ pub fn language_config_for_name(&self, name: &str) -> Option> { let mut best_match_length = 0; let mut best_match_position = None; for (i, configuration) in self.language_configs.iter().enumerate() { if let Some(injection_regex) = &configuration.injection_regex { - if let Some(mat) = injection_regex.find(string) { + if let Some(mat) = injection_regex.find(name) { let length = mat.end() - mat.start(); if length > best_match_length { best_match_position = Some(i); @@ -704,11 +702,20 @@ impl Loader { } } - if let Some(i) = best_match_position { - let configuration = &self.language_configs[i]; - return Some(configuration.clone()); + best_match_position.map(|i| self.language_configs[i].clone()) + } + + pub fn language_configuration_for_injection_string( + &self, + capture: &InjectionLanguageMarker, + ) -> Option> { + match capture { + InjectionLanguageMarker::Name(string) => self.language_config_for_name(string), + InjectionLanguageMarker::Filename(file) => self.language_config_for_file_name(file), + InjectionLanguageMarker::Shebang(shebang) => { + self.language_config_for_language_id(shebang) + } } - None } pub fn language_configs(&self) -> impl Iterator> { @@ -800,7 +807,7 @@ impl Syntax { queue.push_back(self.root); let scopes = self.loader.scopes.load(); - let injection_callback = |language: &str| { + let injection_callback = |language: &InjectionLanguageMarker| { self.loader .language_configuration_for_injection_string(language) .and_then(|language_config| language_config.highlight_config(&scopes)) @@ -961,12 +968,9 @@ impl Syntax { ); let mut injections = Vec::new(); for mat in matches { - let (language_name, content_node, included_children) = injection_for_match( - &layer.config, - &layer.config.injections_query, - &mat, - source_slice, - ); + let (injection_capture, content_node, included_children) = layer + .config + .injection_for_match(&layer.config.injections_query, &mat, source_slice); // Explicitly remove this match so that none of its other captures 
will remain // in the stream of captures. @@ -974,9 +978,10 @@ impl Syntax { // If a language is found with the given name, then add a new language layer // to the highlighted document. - if let (Some(language_name), Some(content_node)) = (language_name, content_node) + if let (Some(injection_capture), Some(content_node)) = + (injection_capture, content_node) { - if let Some(config) = (injection_callback)(&language_name) { + if let Some(config) = (injection_callback)(&injection_capture) { let ranges = intersect_ranges(&layer.ranges, &[content_node], included_children); @@ -1001,14 +1006,11 @@ impl Syntax { ); for mat in matches { let entry = &mut injections_by_pattern_index[mat.pattern_index]; - let (language_name, content_node, included_children) = injection_for_match( - &layer.config, - combined_injections_query, - &mat, - source_slice, - ); - if language_name.is_some() { - entry.0 = language_name; + let (injection_capture, content_node, included_children) = layer + .config + .injection_for_match(combined_injections_query, &mat, source_slice); + if injection_capture.is_some() { + entry.0 = injection_capture; } if let Some(content_node) = content_node { entry.1.push(content_node); @@ -1395,6 +1397,8 @@ pub struct HighlightConfiguration { non_local_variable_patterns: Vec, injection_content_capture_index: Option, injection_language_capture_index: Option, + injection_filename_capture_index: Option, + injection_shebang_capture_index: Option, local_scope_capture_index: Option, local_def_capture_index: Option, local_def_value_capture_index: Option, @@ -1538,6 +1542,8 @@ impl HighlightConfiguration { // Store the numeric ids for all of the special captures. 
let mut injection_content_capture_index = None; let mut injection_language_capture_index = None; + let mut injection_filename_capture_index = None; + let mut injection_shebang_capture_index = None; let mut local_def_capture_index = None; let mut local_def_value_capture_index = None; let mut local_ref_capture_index = None; @@ -1558,6 +1564,8 @@ impl HighlightConfiguration { match name.as_str() { "injection.content" => injection_content_capture_index = i, "injection.language" => injection_language_capture_index = i, + "injection.filename" => injection_filename_capture_index = i, + "injection.shebang" => injection_shebang_capture_index = i, _ => {} } } @@ -1573,6 +1581,8 @@ impl HighlightConfiguration { non_local_variable_patterns, injection_content_capture_index, injection_language_capture_index, + injection_filename_capture_index, + injection_shebang_capture_index, local_scope_capture_index, local_def_capture_index, local_def_value_capture_index, @@ -1631,6 +1641,90 @@ impl HighlightConfiguration { self.highlight_indices.store(Arc::new(indices)); } + + fn injection_pair<'a>( + &self, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> (Option>, Option>) { + let mut injection_capture = None; + let mut content_node = None; + + for capture in query_match.captures { + let index = Some(capture.index); + if index == self.injection_language_capture_index { + let name = byte_range_to_str(capture.node.byte_range(), source); + injection_capture = Some(InjectionLanguageMarker::Name(name)); + } else if index == self.injection_filename_capture_index { + let name = byte_range_to_str(capture.node.byte_range(), source); + let path = Path::new(name.as_ref()).to_path_buf(); + injection_capture = Some(InjectionLanguageMarker::Filename(path.into())); + } else if index == self.injection_shebang_capture_index { + let node_slice = source.byte_slice(capture.node.byte_range()); + + // some languages allow space and newlines before the actual string content + // so a shebang 
could be on either the first or second line + let lines = if let Ok(end) = node_slice.try_line_to_byte(2) { + node_slice.byte_slice(..end) + } else { + node_slice + }; + + static SHEBANG_REGEX: Lazy = Lazy::new(|| Regex::new(SHEBANG).unwrap()); + + injection_capture = SHEBANG_REGEX + .captures(&Cow::from(lines)) + .map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned())) + } else if index == self.injection_content_capture_index { + content_node = Some(capture.node); + } + } + (injection_capture, content_node) + } + + fn injection_for_match<'a>( + &self, + query: &'a Query, + query_match: &QueryMatch<'a, 'a>, + source: RopeSlice<'a>, + ) -> ( + Option>, + Option>, + IncludedChildren, + ) { + let (mut injection_capture, content_node) = self.injection_pair(query_match, source); + + let mut included_children = IncludedChildren::default(); + for prop in query.property_settings(query_match.pattern_index) { + match prop.key.as_ref() { + // In addition to specifying the language name via the text of a + // captured node, it can also be hard-coded via a `#set!` predicate + // that sets the injection.language key. + "injection.language" if injection_capture.is_none() => { + injection_capture = prop + .value + .as_ref() + .map(|s| InjectionLanguageMarker::Name(s.as_ref().into())); + } + + // By default, injections do not include the *children* of an + // `injection.content` node - only the ranges that belong to the + // node itself. This can be changed using a `#set!` predicate that + // sets the `injection.include-children` key. + "injection.include-children" => included_children = IncludedChildren::All, + + // Some queries might only exclude named children but include unnamed + // children in their `injection.content` node. This can be enabled using + // a `#set!` predicate that sets the `injection.include-unnamed-children` key. 
+ "injection.include-unnamed-children" => { + included_children = IncludedChildren::Unnamed + } + _ => {} + } + } + + (injection_capture, content_node, included_children) + } } impl<'a> HighlightIterLayer<'a> { @@ -2042,56 +2136,15 @@ impl<'a> Iterator for HighlightIter<'a> { } } -fn injection_for_match<'a>( - config: &HighlightConfiguration, - query: &'a Query, - query_match: &QueryMatch<'a, 'a>, - source: RopeSlice<'a>, -) -> (Option>, Option>, IncludedChildren) { - let content_capture_index = config.injection_content_capture_index; - let language_capture_index = config.injection_language_capture_index; - - let mut language_name = None; - let mut content_node = None; - for capture in query_match.captures { - let index = Some(capture.index); - if index == language_capture_index { - let name = byte_range_to_str(capture.node.byte_range(), source); - language_name = Some(name); - } else if index == content_capture_index { - content_node = Some(capture.node); - } - } - - let mut included_children = IncludedChildren::default(); - for prop in query.property_settings(query_match.pattern_index) { - match prop.key.as_ref() { - // In addition to specifying the language name via the text of a - // captured node, it can also be hard-coded via a `#set!` predicate - // that sets the injection.language key. - "injection.language" => { - if language_name.is_none() { - language_name = prop.value.as_ref().map(|s| s.as_ref().into()) - } - } - - // By default, injections do not include the *children* of an - // `injection.content` node - only the ranges that belong to the - // node itself. This can be changed using a `#set!` predicate that - // sets the `injection.include-children` key. - "injection.include-children" => included_children = IncludedChildren::All, - - // Some queries might only exclude named children but include unnamed - // children in their `injection.content` node. 
This can be enabled using - // a `#set!` predicate that sets the `injection.include-unnamed-children` key. - "injection.include-unnamed-children" => included_children = IncludedChildren::Unnamed, - _ => {} - } - } - - (language_name, content_node, included_children) +#[derive(Debug, Clone)] +pub enum InjectionLanguageMarker<'a> { + Name(Cow<'a, str>), + Filename(Cow<'a, Path>), + Shebang(String), } +const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)"; + pub struct Merge { iter: I, spans: Box)>>, -- cgit v1.2.3-70-g09d2