From acced82be66c9a3920b8147bdd8fa6785388e8c8 Mon Sep 17 00:00:00 2001 From: ammkrn Date: Sat, 6 Nov 2021 20:31:12 -0500 Subject: feat(book/src/languages.md) (#979) * feat(book/src/languages.md) Add a section in the book about language-specific settings and the languages.toml file. * Update book/src/languages.md Co-authored-by: Gokul Soumya * feat(book/src/guides/adding_languages.md) Add book section on adding a new language to the compile-time/root languages.toml file. * Update book/src/guides/adding_languages.md Co-authored-by: Blaž Hrastnik * Update book/src/guides/adding_languages.md Co-authored-by: Blaž Hrastnik * refactor(revise book/src/languages.md) Change the book page on language settings to match suggestions by archseer and mention both toml files. Co-authored-by: Gokul Soumya Co-authored-by: Blaž Hrastnik --- book/src/guides/adding_languages.md | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 book/src/guides/adding_languages.md (limited to 'book/src/guides') diff --git a/book/src/guides/adding_languages.md b/book/src/guides/adding_languages.md new file mode 100644 index 00000000..85c3d0e8 --- /dev/null +++ b/book/src/guides/adding_languages.md @@ -0,0 +1,52 @@ +# Adding languages + +## Submodules + +To add a new langauge, you should first add a tree-sitter submodule. To do this, you can run the command +```sh +$ git submodule add -f <repository> helix-syntax/languages/tree-sitter-<name> +``` +For example, to add tree-sitter-ocaml you would run +```sh +$ git submodule add -f https://github.com/tree-sitter/tree-sitter-ocaml helix-syntax/languages/tree-sitter-ocaml +``` +Make sure the submodule is shallow by doing +```sh +git config -f .gitmodules submodule.helix-syntax/languages/tree-sitter-<name>.shallow true +``` + +or you can manually add `shallow = true` to `.gitmodules`.
+ +## languages.toml + +Next, you need to add the language to the `languages.toml` found in the root of the repository; this `languages.toml` file is included at compilation time, and is distinct from the `language.toml` file in the user's [configuration directory](../configuration.md). + +These are the available keys and descriptions for the file. + +| Key | Description | +| ---- | ----------- | +| name | The name of the language | +| scope | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. Usually `source.<name>` or `text.<name>` in case of markup languages | +| injection-regex | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential language injection site. [link](https://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-injection) | +| file-types | The filetypes of the language, for example `["yml", "yaml"]` | +| roots | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` | +| auto-format | Whether to autoformat this language when saving | +| comment-token | The token to use as a comment-token | +| indent | The indent to use. Has sub keys `tab-width` and `unit` | +| config | Language server configuration | + +## Queries + +For a language to have syntax-highlighting and indentation among other things, you have to add queries. Add a directory for your language with the path `runtime/queries/<name>/`. The tree-sitter [website](https://tree-sitter.github.io/tree-sitter/syntax-highlighting#queries) gives more info on how to write queries. + +## Common Issues + +- If you get errors when building after switching branches, you may have to remove or update tree-sitter submodules. You can update submodules by running +```sh +$ git submodule update --init +``` +- Make sure to not use the `--remote` flag.
To remove submodules look inside the `.gitmodules` and remove directories that are not present inside of it. + +- If a parser is segfaulting or you want to remove the parser, make sure to remove the submodule *and* the compiled parser in `runtime/grammar/<name>.so` + +- The indents query is `indents.toml`, *not* `indents.scm`. See [this](https://github.com/helix-editor/helix/issues/114) issue for more information. -- cgit v1.2.3-70-g09d2 From 29fe504398e483b8830564727ee8e2f209f06615 Mon Sep 17 00:00:00 2001 From: Blaž Hrastnik Date: Sun, 7 Nov 2021 10:33:05 +0900 Subject: book: Mention git submodule sync too --- book/src/guides/adding_languages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'book/src/guides') diff --git a/book/src/guides/adding_languages.md b/book/src/guides/adding_languages.md index 85c3d0e8..00902876 100644 --- a/book/src/guides/adding_languages.md +++ b/book/src/guides/adding_languages.md @@ -43,7 +43,7 @@ For a language to have syntax-highlighting and indentation among other things, y - If you get errors when building after switching branches, you may have to remove or update tree-sitter submodules. You can update submodules by running ```sh -$ git submodule update --init +$ git submodule sync; git submodule update --init ``` - Make sure to not use the `--remote` flag. To remove submodules look inside the `.gitmodules` and remove directories that are not present inside of it.
-- cgit v1.2.3-70-g09d2 From e0e227d172697c0d3c418704fd20e780ee1a1032 Mon Sep 17 00:00:00 2001 From: Gokul Soumya Date: Mon, 8 Nov 2021 06:22:51 +0530 Subject: Touch up docs for adding new language (#1002) --- book/src/guides/adding_languages.md | 23 +++++++++++++++-------- book/src/guides/textobject.md | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'book/src/guides') diff --git a/book/src/guides/adding_languages.md b/book/src/guides/adding_languages.md index 00902876..c606f8fc 100644 --- a/book/src/guides/adding_languages.md +++ b/book/src/guides/adding_languages.md @@ -2,13 +2,14 @@ ## Submodules -To add a new langauge, you should first add a tree-sitter submodule. To do this, you can run the command +To add a new langauge, you should first add a tree-sitter submodule. To do this, +you can run the command ```sh -$ git submodule add -f helix-syntax/languages/tree-sitter- +git submodule add -f helix-syntax/languages/tree-sitter- ``` For example, to add tree-sitter-ocaml you would run ```sh -$ git submodule add -f https://github.com/tree-sitter/tree-sitter-ocaml helix-syntax/languages/tree-sitter-ocaml +git submodule add -f https://github.com/tree-sitter/tree-sitter-ocaml helix-syntax/languages/tree-sitter-ocaml ``` Make sure the submodule is shallow by doing ```sh @@ -19,7 +20,10 @@ or you can manually add `shallow = true` to `.gitmodules`. ## languages.toml -Next, you need to add the language to the `languages.toml` found in the root of the repository; this `languages.toml` file is included at compilation time, and is distinct from the `language.toml` file in the user's [configuration directory](../configuration.md). +Next, you need to add the language to the [`languages.toml`][languages.toml] found in the root of +the repository; this `languages.toml` file is included at compilation time, and +is distinct from the `language.toml` file in the user's [configuration +directory](../configuration.md). 
These are the available keys and descriptions for the file. @@ -27,7 +31,7 @@ These are the available keys and descriptions for the file. | ---- | ----------- | | name | The name of the language | | scope | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. Usually `source.` or `text.` in case of markup languages | -| injection-regex | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential language injection site. [link](https://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-injection) | +| injection-regex | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential [language injection][treesitter-language-injection] site. | | file-types | The filetypes of the language, for example `["yml", "yaml"]` | | roots | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` | | auto-format | Whether to autoformat this language when saving | @@ -42,11 +46,14 @@ For a language to have syntax-highlighting and indentation among other things, y ## Common Issues - If you get errors when building after switching branches, you may have to remove or update tree-sitter submodules. You can update submodules by running -```sh -$ git submodule sync; git submodule update --init -``` + ```sh + git submodule sync; git submodule update --init + ``` - Make sure to not use the `--remote` flag. To remove submodules look inside the `.gitmodules` and remove directories that are not present inside of it. - If a parser is segfaulting or you want to remove the parser, make sure to remove the submodule *and* the compiled parser in `runtime/grammar/.so` - The indents query is `indents.toml`, *not* `indents.scm`. 
See [this](https://github.com/helix-editor/helix/issues/114) issue for more information. + +[treesitter-language-injection]: https://tree-sitter.github.io/tree-sitter/syntax-highlighting#language-injection +[languages.toml]: https://github.com/helix-editor/helix/blob/master/languages.toml diff --git a/book/src/guides/textobject.md b/book/src/guides/textobject.md index 50b3b574..dd726b7c 100644 --- a/book/src/guides/textobject.md +++ b/book/src/guides/textobject.md @@ -5,7 +5,7 @@ require an accompanying tree-sitter grammar and a `textobjects.scm` query file to work properly. Tree-sitter allows us to query the source code syntax tree and capture specific parts of it. The queries are written in a lisp dialect. More information on how to write queries can be found in the [official tree-sitter -documentation](tree-sitter-queries). +documentation][tree-sitter-queries]. Query files should be placed in `runtime/queries/{language}/textobjects.scm` when contributing. Note that to test the query files locally you should put -- cgit v1.2.3-70-g09d2 From 77dbbc73f9c9b6599bc39b18625285685fe2e4b1 Mon Sep 17 00:00:00 2001 From: ath3 Date: Mon, 8 Nov 2021 16:19:44 +0100 Subject: Detect filetype from shebang line (#1001) --- book/src/guides/adding_languages.md | 3 ++- helix-core/src/indent.rs | 1 + helix-core/src/syntax.rs | 24 ++++++++++++++++++++++++ helix-view/src/document.rs | 4 +++- languages.toml | 35 +++++++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) (limited to 'book/src/guides') diff --git a/book/src/guides/adding_languages.md b/book/src/guides/adding_languages.md index c606f8fc..446eb479 100644 --- a/book/src/guides/adding_languages.md +++ b/book/src/guides/adding_languages.md @@ -33,10 +33,11 @@ These are the available keys and descriptions for the file. | scope | A string like `source.js` that identifies the language. Currently, we strive to match the scope names used by popular TextMate grammars and by the Linguist library. 
Usually `source.<name>` or `text.<name>` in case of markup languages | | injection-regex | regex pattern that will be tested against a language name in order to determine whether this language should be used for a potential [language injection][treesitter-language-injection] site. | | file-types | The filetypes of the language, for example `["yml", "yaml"]` | +| shebangs | The interpreters from the shebang line, for example `["sh", "bash"]` | | roots | A set of marker files to look for when trying to find the workspace root. For example `Cargo.lock`, `yarn.lock` | | auto-format | Whether to autoformat this language when saving | | comment-token | The token to use as a comment-token | -| indent | The indent to use. Has sub keys `tab-width` and `unit` | +| indent | The indent to use. Has sub keys `tab-width` and `unit` | | config | Language server configuration | ## Queries diff --git a/helix-core/src/indent.rs b/helix-core/src/indent.rs index 20f034ea..b6f5081a 100644 --- a/helix-core/src/indent.rs +++ b/helix-core/src/indent.rs @@ -450,6 +450,7 @@ where language: vec![LanguageConfiguration { scope: "source.rust".to_string(), file_types: vec!["rs".to_string()], + shebangs: vec![], language_id: "Rust".to_string(), highlight_config: OnceCell::new(), config: None, diff --git a/helix-core/src/syntax.rs b/helix-core/src/syntax.rs index f3e3f238..84952248 100644 --- a/helix-core/src/syntax.rs +++ b/helix-core/src/syntax.rs @@ -14,6 +14,8 @@ use std::{ cell::RefCell, collections::{HashMap, HashSet}, fmt, + fs::File, + io::Read, path::Path, sync::Arc, }; @@ -52,6 +54,7 @@ pub struct LanguageConfiguration { pub language_id: String, pub scope: String, // source.rust pub file_types: Vec<String>, // filename ends_with? + pub shebangs: Vec<String>, // interpreter(s) associated with language pub roots: Vec<String>, // these indicate project roots <.git, Cargo.toml> pub comment_token: Option<String>, @@ -254,6 +257,7 @@ pub struct Loader { // highlight_names ?
language_configs: Vec<Arc<LanguageConfiguration>>, language_config_ids_by_file_type: HashMap<String, usize>, // Vec<usize> + language_config_ids_by_shebang: HashMap<String, usize>, } impl Loader { @@ -261,6 +265,7 @@ impl Loader { let mut loader = Self { language_configs: Vec::new(), language_config_ids_by_file_type: HashMap::new(), + language_config_ids_by_shebang: HashMap::new(), }; for config in config.language { @@ -273,6 +278,11 @@ impl Loader { .language_config_ids_by_file_type .insert(file_type.clone(), language_id); } + for shebang in &config.shebangs { + loader + .language_config_ids_by_shebang + .insert(shebang.clone(), language_id); + } loader.language_configs.push(Arc::new(config)); } @@ -298,6 +308,20 @@ impl Loader { // TODO: content_regex handling conflict resolution } + pub fn language_config_for_shebang(&self, path: &Path) -> Option<Arc<LanguageConfiguration>> { + // Read the first 128 bytes of the file. If its a shebang line, try to find the language + let file = File::open(path).ok()?; + let mut buf = String::with_capacity(128); + file.take(128).read_to_string(&mut buf).ok()?; + static SHEBANG_REGEX: Lazy<Regex> = + Lazy::new(|| Regex::new(r"^#!\s*(?:\S*[/\\](?:env\s+)?)?([^\s\.\d]+)").unwrap()); + let configuration_id = SHEBANG_REGEX + .captures(&buf) + .and_then(|cap| self.language_config_ids_by_shebang.get(&cap[1])); + + configuration_id.and_then(|&id| self.language_configs.get(id).cloned()) + } + pub fn language_config_for_scope(&self, scope: &str) -> Option<Arc<LanguageConfiguration>> { self.language_configs .iter() diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs index ce5df8ee..a68ab759 100644 --- a/helix-view/src/document.rs +++ b/helix-view/src/document.rs @@ -494,7 +494,9 @@ impl Document { /// Detect the programming language based on the file type.
pub fn detect_language(&mut self, theme: Option<&Theme>, config_loader: &syntax::Loader) { if let Some(path) = &self.path { - let language_config = config_loader.language_config_for_file_name(path); + let language_config = config_loader + .language_config_for_file_name(path) + .or_else(|| config_loader.language_config_for_shebang(path)); self.set_language(theme, language_config); } } diff --git a/languages.toml b/languages.toml index 98892171..067138e4 100644 --- a/languages.toml +++ b/languages.toml @@ -3,6 +3,7 @@ name = "rust" scope = "source.rust" injection-regex = "rust" file-types = ["rs"] +shebangs = [] roots = [] auto-format = true comment-token = "//" @@ -17,6 +18,7 @@ name = "toml" scope = "source.toml" injection-regex = "toml" file-types = ["toml"] +shebangs = [] roots = [] comment-token = "#" @@ -27,6 +29,7 @@ name = "protobuf" scope = "source.proto" injection-regex = "protobuf" file-types = ["proto"] +shebangs = [] roots = [] comment-token = "//" @@ -37,6 +40,7 @@ name = "elixir" scope = "source.elixir" injection-regex = "elixir" file-types = ["ex", "exs"] +shebangs = [] roots = [] comment-token = "#" @@ -48,6 +52,7 @@ name = "mint" scope = "source.mint" injection-regex = "mint" file-types = ["mint"] +shebangs = [] roots = [] comment-token = "//" @@ -59,6 +64,7 @@ name = "json" scope = "source.json" injection-regex = "json" file-types = ["json"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -68,6 +74,7 @@ name = "c" scope = "source.c" injection-regex = "c" file-types = ["c"] # TODO: ["h"] +shebangs = [] roots = [] comment-token = "//" @@ -79,6 +86,7 @@ name = "cpp" scope = "source.cpp" injection-regex = "cpp" file-types = ["cc", "hh", "cpp", "hpp", "h", "ipp", "tpp", "cxx", "hxx", "ixx", "txx", "ino"] +shebangs = [] roots = [] comment-token = "//" @@ -90,6 +98,7 @@ name = "c-sharp" scope = "source.csharp" injection-regex = "c-?sharp" file-types = ["cs"] +shebangs = [] roots = [] comment-token = "//" @@ -100,6 +109,7 @@ name = "go" 
scope = "source.go" injection-regex = "go" file-types = ["go"] +shebangs = [] roots = ["Gopkg.toml", "go.mod"] auto-format = true comment-token = "//" @@ -113,6 +123,7 @@ name = "javascript" scope = "source.js" injection-regex = "^(js|javascript)$" file-types = ["js", "mjs"] +shebangs = [] roots = [] comment-token = "//" # TODO: highlights-jsx, highlights-params @@ -124,6 +135,7 @@ name = "typescript" scope = "source.ts" injection-regex = "^(ts|typescript)$" file-types = ["ts"] +shebangs = [] roots = [] # TODO: highlights-jsx, highlights-params @@ -135,6 +147,7 @@ name = "tsx" scope = "source.tsx" injection-regex = "^(tsx)$" # |typescript file-types = ["tsx"] +shebangs = [] roots = [] # TODO: highlights-jsx, highlights-params @@ -146,6 +159,7 @@ name = "css" scope = "source.css" injection-regex = "css" file-types = ["css"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -155,6 +169,7 @@ name = "html" scope = "text.html.basic" injection-regex = "html" file-types = ["html"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -164,6 +179,7 @@ name = "python" scope = "source.python" injection-regex = "python" file-types = ["py"] +shebangs = ["python"] roots = [] comment-token = "#" @@ -176,6 +192,7 @@ name = "nix" scope = "source.nix" injection-regex = "nix" file-types = ["nix"] +shebangs = [] roots = [] comment-token = "#" @@ -187,6 +204,7 @@ name = "ruby" scope = "source.ruby" injection-regex = "ruby" file-types = ["rb"] +shebangs = ["ruby"] roots = [] comment-token = "#" @@ -198,6 +216,7 @@ name = "bash" scope = "source.bash" injection-regex = "bash" file-types = ["sh", "bash"] +shebangs = ["sh", "bash", "dash"] roots = [] comment-token = "#" @@ -209,6 +228,7 @@ name = "php" scope = "source.php" injection-regex = "php" file-types = ["php"] +shebangs = ["php"] roots = [] indent = { tab-width = 4, unit = " " } @@ -218,6 +238,7 @@ name = "latex" scope = "source.tex" injection-regex = "tex" file-types = ["tex"] +shebangs = [] roots = 
[] comment-token = "%" @@ -228,6 +249,7 @@ name = "julia" scope = "source.julia" injection-regex = "julia" file-types = ["jl"] +shebangs = [] roots = [] comment-token = "#" language-server = { command = "julia", args = [ @@ -253,6 +275,7 @@ name = "java" scope = "source.java" injection-regex = "java" file-types = ["java"] +shebangs = [] roots = [] indent = { tab-width = 4, unit = " " } @@ -261,6 +284,7 @@ name = "ledger" scope = "source.ledger" injection-regex = "ledger" file-types = ["ldg", "ledger", "journal"] +shebangs = [] roots = [] comment-token = ";" indent = { tab-width = 4, unit = " " } @@ -270,6 +294,7 @@ name = "ocaml" scope = "source.ocaml" injection-regex = "ocaml" file-types = ["ml"] +shebangs = [] roots = [] comment-token = "(**)" indent = { tab-width = 2, unit = " " } @@ -278,6 +303,7 @@ indent = { tab-width = 2, unit = " " } name = "ocaml-interface" scope = "source.ocaml.interface" file-types = ["mli"] +shebangs = [] roots = [] comment-token = "(**)" indent = { tab-width = 2, unit = " "} @@ -286,6 +312,7 @@ indent = { tab-width = 2, unit = " "} name = "lua" scope = "source.lua" file-types = ["lua"] +shebangs = [] roots = [] comment-token = "--" indent = { tab-width = 2, unit = " " } @@ -295,6 +322,7 @@ name = "svelte" scope = "source.svelte" injection-regex = "svelte" file-types = ["svelte"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } language-server = { command = "svelteserver", args = ["--stdio"] } @@ -305,6 +333,7 @@ name = "vue" scope = "source.vue" injection-regex = "vue" file-types = ["vue"] +shebangs = [] roots = [] indent = { tab-width = 2, unit = " " } @@ -312,6 +341,7 @@ indent = { tab-width = 2, unit = " " } name = "yaml" scope = "source.yaml" file-types = ["yml", "yaml"] +shebangs = [] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " } @@ -331,6 +361,7 @@ name = "zig" scope = "source.zig" injection-regex = "zig" file-types = ["zig"] +shebangs = [] roots = ["build.zig"] auto-format = true 
comment-token = "//" @@ -343,6 +374,7 @@ name = "prolog" scope = "source.prolog" roots = [] file-types = ["pl", "prolog"] +shebangs = ["swipl"] comment-token = "%" language-server = { command = "swipl", args = [ @@ -354,6 +386,7 @@ language-server = { command = "swipl", args = [ name = "tsq" scope = "source.tsq" file-types = ["scm"] +shebangs = [] roots = [] comment-token = ";" indent = { tab-width = 2, unit = " " } @@ -362,6 +395,7 @@ indent = { tab-width = 2, unit = " " } name = "cmake" scope = "source.cmake" file-types = ["cmake", "CMakeLists.txt"] +shebangs = [] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " } @@ -371,6 +405,7 @@ language-server = { command = "cmake-language-server" } name = "perl" scope = "source.perl" file-types = ["pl", "pm"] +shebangs = ["perl"] roots = [] comment-token = "#" indent = { tab-width = 2, unit = " " } -- cgit v1.2.3-70-g09d2