aboutsummaryrefslogtreecommitdiff
path: root/helix-loader
diff options
context:
space:
mode:
author: Michael Davis <2022-02-16 13:57:20 +0000>
committer: Blaž Hrastnik <2022-03-10 08:31:57 +0000>
commit4fc991fdeca5db36bd7be7197510e62a019e1677 (patch)
tree03ce0022ba5f6aa71adf1c81214d05db8a84f035 /helix-loader
parent08ee949dcb904dc27aa41a62ad686c14c0a406bb (diff)
migrate grammar fetching/building code into helix-loader crate
This is a rather large refactor that moves most of the code for loading, fetching, and building grammars into a new helix-loader module. This works well with the [[grammars]] syntax for languages.toml defined earlier: we only have to depend on the types for GrammarConfiguration in helix-loader and can leave all the [[language]] entries for helix-core.
Diffstat (limited to 'helix-loader')
-rw-r--r--helix-loader/Cargo.toml23
-rw-r--r--helix-loader/build.rs6
-rw-r--r--helix-loader/src/grammar.rs388
-rw-r--r--helix-loader/src/lib.rs161
4 files changed, 578 insertions, 0 deletions
diff --git a/helix-loader/Cargo.toml b/helix-loader/Cargo.toml
new file mode 100644
index 00000000..21b37333
--- /dev/null
+++ b/helix-loader/Cargo.toml
@@ -0,0 +1,23 @@
+# Manifest for helix-loader: runtime-path resolution and tree-sitter grammar
+# fetching/building, split out of helix-core.
+[package]
+name = "helix-loader"
+version = "0.6.0"
+description = "A post-modern text editor."
+authors = ["Blaž Hrastnik <blaz@mxxn.io>"]
+edition = "2021"
+license = "MPL-2.0"
+categories = ["editor"]
+repository = "https://github.com/helix-editor/helix"
+homepage = "https://helix-editor.com"
+
+[dependencies]
+anyhow = "1"
+serde = { version = "1.0", features = ["derive"] }
+toml = "0.5"
+etcetera = "0.3"
+tree-sitter = "0.20"
+libloading = "0.7"
+once_cell = "1.9"
+
+# cloning/compiling tree-sitter grammars
+cc = { version = "1" }
+threadpool = { version = "1.0" }
diff --git a/helix-loader/build.rs b/helix-loader/build.rs
new file mode 100644
index 00000000..e0ebd1c4
--- /dev/null
+++ b/helix-loader/build.rs
@@ -0,0 +1,6 @@
+// Re-exports cargo's TARGET triple as the BUILD_TARGET env var so the crate
+// can read it at compile time via env!("BUILD_TARGET") (see src/grammar.rs).
+fn main() {
+ println!(
+ "cargo:rustc-env=BUILD_TARGET={}",
+ std::env::var("TARGET").unwrap()
+ );
+}
diff --git a/helix-loader/src/grammar.rs b/helix-loader/src/grammar.rs
new file mode 100644
index 00000000..61ef464f
--- /dev/null
+++ b/helix-loader/src/grammar.rs
@@ -0,0 +1,388 @@
+use anyhow::{anyhow, Context, Result};
+use libloading::{Library, Symbol};
+use serde::{Deserialize, Serialize};
+use std::fs;
+use std::time::SystemTime;
+use std::{
+ collections::HashSet,
+ path::{Path, PathBuf},
+ process::Command,
+ sync::mpsc::channel,
+};
+use tree_sitter::Language;
+
+// File extension for compiled grammar dynamic libraries on each platform.
+#[cfg(unix)]
+const DYLIB_EXTENSION: &str = "so";
+
+#[cfg(windows)]
+const DYLIB_EXTENSION: &str = "dll";
+
+/// The subset of `languages.toml` this crate reads: an optional
+/// `use-grammars` selection filter plus the `[[grammar]]` entries.
+#[derive(Debug, Serialize, Deserialize)]
+struct Configuration {
+ // `use-grammars = { only = [...] }` or `use-grammars = { except = [...] }`.
+ #[serde(rename = "use-grammars")]
+ pub grammar_selection: Option<GrammarSelection>,
+ pub grammar: Vec<GrammarConfiguration>,
+}
+
+/// Filter over grammar names: fetch/build only these, or all except these.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", untagged)]
+pub enum GrammarSelection {
+ Only(HashSet<String>),
+ Except(HashSet<String>),
+}
+
+/// One `[[grammar]]` entry: the grammar's name and where its sources live.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct GrammarConfiguration {
+ #[serde(rename = "name")]
+ pub grammar_id: String,
+ pub source: GrammarSource,
+}
+
+/// Grammar source location: a local path, or a git remote pinned to a
+/// revision, optionally rooted at a subdirectory of the repository.
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase", untagged)]
+pub enum GrammarSource {
+ Local {
+ path: String,
+ },
+ Git {
+ #[serde(rename = "git")]
+ remote: String,
+ #[serde(rename = "rev")]
+ revision: String,
+ subpath: Option<String>,
+ },
+}
+
+// Target triple baked in by build.rs; passed to `cc` so grammar builds use
+// the same host/target as the editor binary.
+const BUILD_TARGET: &str = env!("BUILD_TARGET");
+// Name of the git remote managed in each grammar's source checkout.
+const REMOTE_NAME: &str = "origin";
+
+/// Loads the compiled grammar `<runtime>/grammars/<name>.<DYLIB_EXTENSION>`
+/// and returns its `Language` by calling the `tree_sitter_<name>` symbol.
+pub fn get_language(name: &str) -> Result<Language> {
+ let name = name.to_ascii_lowercase();
+ let mut library_path = crate::runtime_dir().join("grammars").join(&name);
+ library_path.set_extension(DYLIB_EXTENSION);
+
+ let library = unsafe { Library::new(&library_path) }
+ .with_context(|| format!("Error opening dynamic library {library_path:?}"))?;
+ // Exported symbols use underscores, e.g. `tree_sitter_foo_bar` for "foo-bar".
+ let language_fn_name = format!("tree_sitter_{}", name.replace('-', "_"));
+ let language = unsafe {
+ let language_fn: Symbol<unsafe extern "C" fn() -> Language> = library
+ .get(language_fn_name.as_bytes())
+ .with_context(|| format!("Failed to load symbol {language_fn_name}"))?;
+ language_fn()
+ };
+ // Deliberately leak the handle so the dylib is never unloaded — presumably
+ // because the returned `Language` refers to code inside the library.
+ std::mem::forget(library);
+ Ok(language)
+}
+
+/// Fetches (clones/updates) the sources of every configured git grammar in parallel.
+pub fn fetch_grammars() -> Result<()> {
+ run_parallel(get_grammar_configs()?, fetch_grammar, "fetch")
+}
+
+/// Compiles every configured grammar into a dynamic library, in parallel.
+pub fn build_grammars() -> Result<()> {
+ run_parallel(get_grammar_configs()?, build_grammar, "build")
+}
+
+/// Returns the set of grammar configurations the user requests.
+/// Grammars are configured in the default and user `languages.toml` and are
+/// merged. The `grammar_selection` key of the config is then used to filter
+/// down all grammars into a subset of the user's choosing.
+fn get_grammar_configs() -> Result<Vec<GrammarConfiguration>> {
+ let config: Configuration = crate::user_lang_config()
+ .context("Could not parse languages.toml")?
+ .try_into()?;
+
+ // No `use-grammars` key means every configured grammar is selected.
+ let grammars = match config.grammar_selection {
+ Some(GrammarSelection::Only(selections)) => config
+ .grammar
+ .into_iter()
+ .filter(|grammar| selections.contains(&grammar.grammar_id))
+ .collect(),
+ Some(GrammarSelection::Except(rejections)) => config
+ .grammar
+ .into_iter()
+ .filter(|grammar| !rejections.contains(&grammar.grammar_id))
+ .collect(),
+ None => config.grammar,
+ };
+
+ Ok(grammars)
+}
+
+// Runs `job` over each grammar on a thread pool, waits for all jobs, then
+// reports the first failure (if any), tagged with `action` ("fetch"/"build").
+fn run_parallel<F>(grammars: Vec<GrammarConfiguration>, job: F, action: &'static str) -> Result<()>
+where
+ F: Fn(GrammarConfiguration) -> Result<()> + std::marker::Send + 'static + Copy,
+{
+ let pool = threadpool::Builder::new().build();
+ let (tx, rx) = channel();
+
+ for grammar in grammars {
+ // Each job gets its own sender clone; results are collected below.
+ let tx = tx.clone();
+
+ pool.execute(move || {
+ tx.send(job(grammar)).unwrap();
+ });
+ }
+ // Block until every queued job has finished.
+ pool.join();
+
+ // TODO: print all failures instead of the first one found.
+ if let Some(failure) = rx.try_iter().find_map(|result| result.err()) {
+ Err(anyhow!(
+ "Failed to {} some grammar(s).\n{}",
+ action,
+ failure
+ ))
+ } else {
+ Ok(())
+ }
+}
+
+// Ensures a git-sourced grammar has a checkout of the pinned revision under
+// `<runtime>/grammars/sources/<name>`. Local grammars are skipped.
+fn fetch_grammar(grammar: GrammarConfiguration) -> Result<()> {
+ if let GrammarSource::Git {
+ remote, revision, ..
+ } = grammar.source
+ {
+ let grammar_dir = crate::runtime_dir()
+ .join("grammars/sources")
+ .join(&grammar.grammar_id);
+
+ fs::create_dir_all(&grammar_dir).context(format!(
+ "Could not create grammar directory {:?}",
+ grammar_dir
+ ))?;
+
+ // ensure the grammar dir contains a git directory
+ if !grammar_dir.join(".git").is_dir() {
+ git(&grammar_dir, ["init"])?;
+ }
+
+ // ensure the remote matches the configured remote
+ if get_remote_url(&grammar_dir).map_or(true, |s| s != remote) {
+ set_remote(&grammar_dir, &remote)?;
+ }
+
+ // ensure the revision matches the configured revision
+ if get_revision(&grammar_dir).map_or(true, |s| s != revision) {
+ // Fetch the exact revision from the remote.
+ // Supported by server-side git since v2.5.0 (July 2015),
+ // enabled by default on major git hosts.
+ git(&grammar_dir, ["fetch", REMOTE_NAME, &revision])?;
+ git(&grammar_dir, ["checkout", &revision])?;
+
+ println!(
+ "Grammar '{}' checked out at '{}'.",
+ grammar.grammar_id, revision
+ );
+ Ok(())
+ } else {
+ println!("Grammar '{}' is already up to date.", grammar.grammar_id);
+ Ok(())
+ }
+ } else {
+ println!("Skipping local grammar '{}'", grammar.grammar_id);
+ Ok(())
+ }
+}
+
+// Sets the remote for a repository to the given URL, creating the remote if
+// it does not yet exist.
+fn set_remote(repository_dir: &Path, remote_url: &str) -> Result<String> {
+ git(
+ repository_dir,
+ ["remote", "set-url", REMOTE_NAME, remote_url],
+ )
+ // `set-url` fails when the remote does not exist yet; fall back to `add`.
+ .or_else(|_| git(repository_dir, ["remote", "add", REMOTE_NAME, remote_url]))
+}
+
+// URL of the managed remote, or None when the git command fails.
+fn get_remote_url(repository_dir: &Path) -> Option<String> {
+ git(repository_dir, ["remote", "get-url", REMOTE_NAME]).ok()
+}
+
+// Commit hash of HEAD, or None when the git command fails.
+fn get_revision(repository_dir: &Path) -> Option<String> {
+ git(repository_dir, ["rev-parse", "HEAD"]).ok()
+}
+
+// A wrapper around 'git' commands which returns trimmed stdout on success and
+// an error message showing stdout and stderr on failure (the command itself
+// is not yet included -- see TODO below).
+fn git<I, S>(repository_dir: &Path, args: I) -> Result<String>
+where
+ I: IntoIterator<Item = S>,
+ S: AsRef<std::ffi::OsStr>,
+{
+ let output = Command::new("git")
+ .args(args)
+ .current_dir(repository_dir)
+ .output()?;
+
+ if output.status.success() {
+ Ok(String::from_utf8_lossy(&output.stdout)
+ .trim_end()
+ .to_owned())
+ } else {
+ // TODO: figure out how to display the git command using `args`
+ Err(anyhow!(
+ "Git command failed.\nStdout: {}\nStderr: {}",
+ String::from_utf8_lossy(&output.stdout),
+ String::from_utf8_lossy(&output.stderr),
+ ))
+ }
+}
+
+// Resolves a grammar's source directory, verifies it is non-empty, and
+// compiles the sources under its `src/` directory into a dynamic library.
+fn build_grammar(grammar: GrammarConfiguration) -> Result<()> {
+ // NOTE(review): debug-print of the whole config left in; consider removing
+ // or demoting to a log statement.
+ println!("{:#?}", grammar);
+ let grammar_dir = if let GrammarSource::Local { path } = &grammar.source {
+ PathBuf::from(&path)
+ } else {
+ crate::runtime_dir()
+ .join("grammars/sources")
+ .join(&grammar.grammar_id)
+ };
+
+ let grammar_dir_entries = grammar_dir.read_dir().with_context(|| {
+ format!("Failed to read directory {grammar_dir:?}. Did you use 'hx --fetch-grammars'?")
+ })?;
+
+ // An existing-but-empty directory means the fetch step never ran.
+ if grammar_dir_entries.count() == 0 {
+ return Err(anyhow!(
+ "Directory {grammar_dir:?} is empty. Did you use 'hx --fetch-grammars'?"
+ ));
+ };
+
+ // Git sources may root the grammar in a subdirectory of the repository.
+ let path = match &grammar.source {
+ GrammarSource::Git {
+ subpath: Some(subpath),
+ ..
+ } => grammar_dir.join(subpath),
+ _ => grammar_dir,
+ }
+ .join("src");
+
+ build_tree_sitter_library(&path, grammar)
+}
+
+// Compiles `parser.c` (plus `scanner.c`/`scanner.cc` when present) from
+// `src_path` into `<runtime>/grammars/<name>.<DYLIB_EXTENSION>`, skipping the
+// build when the existing library is newer than the sources.
+fn build_tree_sitter_library(src_path: &Path, grammar: GrammarConfiguration) -> Result<()> {
+ let header_path = src_path;
+ let parser_path = src_path.join("parser.c");
+ let mut scanner_path = src_path.join("scanner.c");
+
+ // A grammar may ship a C scanner (`scanner.c`), a C++ one (`scanner.cc`),
+ // or none at all.
+ let scanner_path = if scanner_path.exists() {
+ Some(scanner_path)
+ } else {
+ scanner_path.set_extension("cc");
+ if scanner_path.exists() {
+ Some(scanner_path)
+ } else {
+ None
+ }
+ };
+ let parser_lib_path = crate::runtime_dir().join("grammars");
+ let mut library_path = parser_lib_path.join(&grammar.grammar_id);
+ library_path.set_extension(DYLIB_EXTENSION);
+
+ let recompile = needs_recompile(&library_path, &parser_path, &scanner_path)
+ .context("Failed to compare source and binary timestamps")?;
+
+ if !recompile {
+ println!("Grammar '{}' is already built.", grammar.grammar_id);
+ return Ok(());
+ }
+
+ println!("Building grammar '{}'", grammar.grammar_id);
+
+ // `cc` is used only to locate a suitable compiler and its environment
+ // (cargo_metadata is disabled); the compiler is invoked manually below.
+ let mut config = cc::Build::new();
+ config
+ .cpp(true)
+ .opt_level(3)
+ .cargo_metadata(false)
+ .host(BUILD_TARGET)
+ .target(BUILD_TARGET);
+ let compiler = config.get_compiler();
+ let mut command = Command::new(compiler.path());
+ command.current_dir(src_path);
+ for (key, value) in compiler.env() {
+ command.env(key, value);
+ }
+
+ if cfg!(windows) {
+ // MSVC-style invocation: build a DLL with the grammar's `src` dir on
+ // the include path.
+ command
+ .args(&["/nologo", "/LD", "/I"])
+ .arg(header_path)
+ .arg("/Od")
+ .arg("/utf-8");
+ if let Some(scanner_path) = scanner_path.as_ref() {
+ command.arg(scanner_path);
+ }
+
+ command
+ .arg(parser_path)
+ .arg("/link")
+ .arg(format!("/out:{}", library_path.to_str().unwrap()));
+ } else {
+ // GCC/Clang-style invocation: build a shared object.
+ command
+ .arg("-shared")
+ .arg("-fPIC")
+ .arg("-fno-exceptions")
+ .arg("-g")
+ .arg("-I")
+ .arg(header_path)
+ .arg("-o")
+ .arg(&library_path)
+ .arg("-O3");
+ if let Some(scanner_path) = scanner_path.as_ref() {
+ // `.c` scanners are compiled as C99; `.cc` scanners use the
+ // compiler's C++ default.
+ if scanner_path.extension() == Some("c".as_ref()) {
+ command.arg("-xc").arg("-std=c99").arg(scanner_path);
+ } else {
+ command.arg(scanner_path);
+ }
+ }
+ command.arg("-xc").arg(parser_path);
+ // Linker hardening flags for ELF platforms (not supported on macOS).
+ if cfg!(all(unix, not(target_os = "macos"))) {
+ command.arg("-Wl,-z,relro,-z,now");
+ }
+ }
+
+ let output = command.output().context("Failed to execute C compiler")?;
+ if !output.status.success() {
+ return Err(anyhow!(
+ "Parser compilation failed.\nStdout: {}\nStderr: {}",
+ String::from_utf8_lossy(&output.stdout),
+ String::from_utf8_lossy(&output.stderr)
+ ));
+ }
+
+ Ok(())
+}
+
+// Returns whether the library must be rebuilt: true when it does not exist,
+// or when `parser.c` or the scanner has a newer mtime than the library.
+fn needs_recompile(
+ lib_path: &Path,
+ parser_c_path: &Path,
+ scanner_path: &Option<PathBuf>,
+) -> Result<bool> {
+ if !lib_path.exists() {
+ return Ok(true);
+ }
+ let lib_mtime = mtime(lib_path)?;
+ if mtime(parser_c_path)? > lib_mtime {
+ return Ok(true);
+ }
+ if let Some(scanner_path) = scanner_path {
+ if mtime(scanner_path)? > lib_mtime {
+ return Ok(true);
+ }
+ }
+ Ok(false)
+}
+
+// Last-modification time of `path` as reported by the filesystem.
+fn mtime(path: &Path) -> Result<SystemTime> {
+ Ok(fs::metadata(path)?.modified()?)
+}
+
+/// Gives the contents of a file from a language's `runtime/queries/<lang>`
+/// directory. Uses the cached `RUNTIME_DIR` rather than re-resolving it.
+pub fn load_runtime_file(language: &str, filename: &str) -> Result<String, std::io::Error> {
+ let path = crate::RUNTIME_DIR
+ .join("queries")
+ .join(language)
+ .join(filename);
+ std::fs::read_to_string(&path)
+}
diff --git a/helix-loader/src/lib.rs b/helix-loader/src/lib.rs
new file mode 100644
index 00000000..a2c4d96f
--- /dev/null
+++ b/helix-loader/src/lib.rs
@@ -0,0 +1,161 @@
+pub mod grammar;
+
+use etcetera::base_strategy::{choose_base_strategy, BaseStrategy};
+
+/// Runtime directory, resolved once on first use (see `runtime_dir`).
+pub static RUNTIME_DIR: once_cell::sync::Lazy<std::path::PathBuf> =
+ once_cell::sync::Lazy::new(runtime_dir);
+
+/// Resolves the runtime directory, trying in order: the `HELIX_RUNTIME`
+/// environment variable, `<config_dir>/runtime`, the cargo workspace's
+/// `runtime` directory (during development), and finally a `runtime`
+/// directory next to the running executable.
+pub fn runtime_dir() -> std::path::PathBuf {
+ if let Ok(dir) = std::env::var("HELIX_RUNTIME") {
+ return dir.into();
+ }
+
+ const RT_DIR: &str = "runtime";
+ let conf_dir = config_dir().join(RT_DIR);
+ if conf_dir.exists() {
+ return conf_dir;
+ }
+
+ if let Ok(dir) = std::env::var("CARGO_MANIFEST_DIR") {
+ // this is the directory of the crate being run by cargo, we need the workspace path so we take the parent
+ return std::path::PathBuf::from(dir).parent().unwrap().join(RT_DIR);
+ }
+
+ // fallback to location of the executable being run
+ std::env::current_exe()
+ .ok()
+ .and_then(|path| path.parent().map(|path| path.to_path_buf().join(RT_DIR)))
+ .unwrap()
+}
+
+/// Per-user configuration directory: `<platform config dir>/helix`.
+pub fn config_dir() -> std::path::PathBuf {
+ // TODO: allow env var override
+ let strategy = choose_base_strategy().expect("Unable to find the config directory!")
+ let mut path = strategy.config_dir();
+ path.push("helix");
+ path
+}
+
+/// Per-user cache directory: `<platform cache dir>/helix`.
+pub fn cache_dir() -> std::path::PathBuf {
+ // TODO: allow env var override
+ // NOTE(review): expect message says "config directory" but this resolves
+ // the cache directory -- copy-paste artifact.
+ let strategy = choose_base_strategy().expect("Unable to find the config directory!");
+ let mut path = strategy.cache_dir();
+ path.push("helix");
+ path
+}
+
+/// Path of the main config file: `<config_dir>/config.toml`.
+pub fn config_file() -> std::path::PathBuf {
+ config_dir().join("config.toml")
+}
+
+/// Path of the user language config: `<config_dir>/languages.toml`.
+pub fn lang_config_file() -> std::path::PathBuf {
+ config_dir().join("languages.toml")
+}
+
+/// Path of the log file: `<cache_dir>/helix.log`.
+pub fn log_file() -> std::path::PathBuf {
+ cache_dir().join("helix.log")
+}
+
+/// Default built-in `languages.toml`, bundled into the binary at compile time.
+pub fn default_lang_config() -> toml::Value {
+ // NOTE(review): "bultin-in" typo in the expect message below is a runtime
+ // string and is left unchanged here.
+ toml::from_slice(include_bytes!("../../languages.toml"))
+ .expect("Could not parse bultin-in languages.toml to valid toml")
+}
+
+/// User configured languages.toml file, merged with the default config.
+/// A missing/unreadable user file falls back to the defaults alone; a
+/// present-but-invalid file is a hard error.
+pub fn user_lang_config() -> Result<toml::Value, toml::de::Error> {
+ let def_lang_conf = default_lang_config();
+ let data = std::fs::read(crate::config_dir().join("languages.toml"));
+ let user_lang_conf = match data {
+ Ok(raw) => {
+ let value = toml::from_slice(&raw)?;
+ merge_toml_values(def_lang_conf, value)
+ }
+ Err(_) => def_lang_conf,
+ };
+
+ Ok(user_lang_conf)
+}
+
+// Recursively merges two TOML values; right overrides left.
+// Tables merge key-by-key; arrays merge element-wise by the `name` key (so
+// `[[language]]` entries with the same name combine), appending elements
+// without a matching name; any other type is replaced by the right value.
+pub fn merge_toml_values(left: toml::Value, right: toml::Value) -> toml::Value {
+ use toml::Value;
+
+ fn get_name(v: &Value) -> Option<&str> {
+ v.get("name").and_then(Value::as_str)
+ }
+
+ match (left, right) {
+ (Value::Array(mut left_items), Value::Array(right_items)) => {
+ left_items.reserve(right_items.len());
+ for rvalue in right_items {
+ // Find a left element with the same `name` and merge into it;
+ // note the merged element is moved to the end of the array.
+ let lvalue = get_name(&rvalue)
+ .and_then(|rname| left_items.iter().position(|v| get_name(v) == Some(rname)))
+ .map(|lpos| left_items.remove(lpos));
+ let mvalue = match lvalue {
+ Some(lvalue) => merge_toml_values(lvalue, rvalue),
+ None => rvalue,
+ };
+ left_items.push(mvalue);
+ }
+ Value::Array(left_items)
+ }
+ (Value::Table(mut left_map), Value::Table(right_map)) => {
+ for (rname, rvalue) in right_map {
+ match left_map.remove(&rname) {
+ Some(lvalue) => {
+ let merged_value = merge_toml_values(lvalue, rvalue);
+ left_map.insert(rname, merged_value);
+ }
+ None => {
+ left_map.insert(rname, rvalue);
+ }
+ }
+ }
+ Value::Table(left_map)
+ }
+ // Catch everything else we didn't handle, and use the right value
+ (_, value) => value,
+ }
+}
+
+#[cfg(test)]
+mod merge_toml_tests {
+ use super::merge_toml_values;
+
+ // Merging a user `languages.toml` over the built-in one must keep
+ // defaults, apply overrides, and add new keys.
+ #[test]
+ fn language_tomls() {
+ use toml::Value;
+
+ const USER: &str = "
+ [[language]]
+ name = \"nix\"
+ test = \"bbb\"
+ indent = { tab-width = 4, unit = \" \", test = \"aaa\" }
+ ";
+
+ let base: Value = toml::from_slice(include_bytes!("../../languages.toml"))
+ .expect("Couldn't parse built-in languages config")
+ let user: Value = toml::from_str(USER).unwrap();
+
+ let merged = merge_toml_values(base, user);
+ let languages = merged.get("language").unwrap().as_array().unwrap();
+ let nix = languages
+ .iter()
+ .find(|v| v.get("name").unwrap().as_str().unwrap() == "nix")
+ .unwrap();
+ let nix_indent = nix.get("indent").unwrap();
+
+ // We changed tab-width and unit in indent so check that they have the new values
+ assert_eq!(
+ nix_indent.get("tab-width").unwrap().as_integer().unwrap(),
+ 4
+ );
+ assert_eq!(nix_indent.get("unit").unwrap().as_str().unwrap(), " ");
+ // We added new keys, so check them
+ assert_eq!(nix.get("test").unwrap().as_str().unwrap(), "bbb");
+ assert_eq!(nix_indent.get("test").unwrap().as_str().unwrap(), "aaa");
+ // We didn't change comment-token so it should be the same
+ assert_eq!(nix.get("comment-token").unwrap().as_str().unwrap(), "#");
+ }
+}