git.delta.rocks / jrsonnet / refs/commits / 3627c6c6df00

difftreelog

feat nixos-install target

Lach2025-04-06parent: #3972fee.patch.diff
in: trunk

6 files changed

modifiedCargo.lockdiffbeforeafterboth
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,7 +924,6 @@
  "hostname",
  "human-repr",
  "indicatif",
- "indoc",
  "itertools 0.13.0",
  "nix-eval",
  "nixlike",
@@ -958,6 +957,7 @@
  "fleet-shared",
  "futures",
  "hostname",
+ "indoc",
  "itertools 0.13.0",
  "nix-eval",
  "nixlike",
modifiedcmds/fleet/Cargo.tomldiffbeforeafterboth
--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -47,7 +47,6 @@
 nix-eval.workspace = true
 nom = "7.1.3"
 fleet-base = { version = "0.1.0", path = "../../crates/fleet-base" }
-indoc = "2.0.6"
 
 [features]
 default = ["indicatif"]
modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
after · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost, DeployKind},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75	let mut parts = g.split_whitespace();76	let id = parts.next()?;77	let id: u32 = id.parse().ok()?;78	let date = parts.next()?;79	let time = parts.next()?;80	let current = if let Some(current) = parts.next() {81		if current == "(current)" {82			Some(true)83		} else {84			None85		}86	} else {87		Some(false)88	};89	let current = current?;90	if parts.next().is_some() {91		warn!("unexpected text after generation: {g}");92	}93	Some(Generation {94		id,95		current,96		datetime: format!("{date} {time}"),97	})98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101	let mut cmd = host.cmd("nix-env").await?;102	cmd.comparg("--profile", "/nix/var/nix/profiles/system")103		.arg("--list-generations");104	// Sudo is required due to --list-generations acquiring lock on the profile.105	let data = cmd.sudo().run_string().await?;106	let generations = data107		.split('\n')108		.map(|e| e.trim())109		.filter(|&l| !l.is_empty())110		.filter_map(|g| {111			let gen = parse_generation_line(g);112			if gen.is_none() {113				warn!("bad generation: {g}");114			}115			gen116		})117		.collect::<Vec<_>>();118	let current = generations119		.into_iter()120		.filter(|g| g.current)121		.at_most_one()122		.map_err(|_e| anyhow!("bad list-generations output"))?123		.ok_or_else(|| anyhow!("failed to find generation"))?;124	Ok(current)125}126127async fn deploy_task(128	action: DeployAction,129	host: &ConfigHost,130	built: PathBuf,131	specialisation: Option<String>,132	disable_rollback: bool,133) -> Result<()> {134	let deploy_kind = host.deploy_kind().await?;135	if deploy_kind == DeployKind::NixosInstall136		&& 
!matches!(action, DeployAction::Boot | DeployAction::Upload)137	{138		bail!("nixos-install deploy kind only supports boot and upload actions");139	}140141	let mut failed = false;142143	// TODO: Lockfile, to prevent concurrent system switch?144	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146	// unit name conflict in systemd-run147	// This code is tied to rollback.nix148	if !disable_rollback && action.should_create_rollback_marker() {149		let _span = info_span!("preparing").entered();150		info!("preparing for rollback");151		let generation = get_current_generation(host).await?;152		info!(153			"rollback target would be {} {}",154			generation.id, generation.datetime155		);156		{157			let mut cmd = host.cmd("sh").await?;158			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159			if let Err(e) = cmd.sudo().run().await {160				error!("failed to set rollback marker: {e}");161				failed = true;162			}163		}164		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165		// Kicking it on manually will work best.166		//167		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168		// only allow one instance of it.169170		// TODO: We should also watch how this process is going.171		// After running this command, we have less than 3 minutes to deploy everything,172		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.173		// Anyway, reboot will still help in this case.174		if action.should_schedule_rollback_run() {175			let mut cmd = host.cmd("systemd-run").await?;176			cmd.comparg("--on-active", "3min")177				
.comparg("--unit", "rollback-watchdog-run")178				.arg("systemctl")179				.arg("start")180				.arg("rollback-watchdog.service");181			if let Err(e) = cmd.sudo().run().await {182				error!("failed to schedule rollback run: {e}");183				failed = true;184			}185		}186	}187	if deploy_kind == DeployKind::NixosInstall {188		info!(189			"running nixos-install to switch profile, install bootloader, and perform activation"190		);191		let mut cmd = host.cmd("nixos-install").await?;192		cmd.arg("--system").arg(&built).args([193			// Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.194			// It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.195			"--no-channel-copy",196			"--root",197			"/mnt",198		]);199		if let Err(e) = cmd.sudo().run().await {200			error!("failed to execute nixos-install: {e}");201			failed = true;202		}203	} else {204		if action.should_switch_profile() && !failed {205			info!("switching system profile generation");206207			// To avoid even more problems, using nixos-install for now.208			// // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,209			// // falling back to using nix-env command210			// // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.211			// let mut cmd = host.cmd("nix-env").await?;212			// cmd.args([213			// 	"--store",214			// 	"/mnt",215			// 	"--profile",216			// 	"/mnt/nix/var/nix/profiles/system",217			// 	"--set",218			// ])219			// .arg(&built);220			// if let Err(e) = cmd.sudo().run_nix().await {221			// 	error!("failed to switch system profile generation: {e}");222			// 	failed = true;223			// }224			// It would also be possible to update profile atomically during copy:225			// https://github.com/NixOS/nix/pull/11657226			let mut cmd = host.nix_cmd().await?;227			
cmd.arg("build");228			cmd.comparg("--profile", "/nix/var/nix/profiles/system");229			cmd.arg(&built);230			if let Err(e) = cmd.sudo().run_nix().await {231				error!("failed to switch system profile generation: {e}");232				failed = true;233			}234		}235236		// FIXME: Connection might be disconnected after activation run237238		if action.should_activate() && !failed {239			let _span = info_span!("activating").entered();240			info!("executing activation script");241			let specialised = if let Some(specialisation) = specialisation {242				let mut specialised = built.join("specialisation");243				specialised.push(specialisation);244				specialised245			} else {246				built.clone()247			};248			let switch_script = specialised.join("bin/switch-to-configuration");249			let mut cmd = host.cmd(switch_script).in_current_span().await?;250			cmd.arg(action.name().expect("upload.should_activate == false"));251			if let Err(e) = cmd.sudo().run().in_current_span().await {252				error!("failed to activate: {e}");253				failed = true;254			}255		}256	}257	if action.should_create_rollback_marker() {258		if !disable_rollback {259			if failed {260				if action.should_schedule_rollback_run() {261					info!("executing rollback");262					if let Err(e) = host263						.systemctl_start("rollback-watchdog.service")264						.instrument(info_span!("rollback"))265						.await266					{267						error!("failed to trigger rollback: {e}")268					}269				}270			} else {271				info!("trying to mark upgrade as successful");272				if let Err(e) = host273					.rm_file("/etc/fleet_rollback_marker", true)274					.in_current_span()275					.await276				{277					error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")278				}279			}280			info!("disarming watchdog, just in case");281			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {282				// It is ok, if there was no reboot - then timer might not be running.283			}284			if action.should_schedule_rollback_run() {285				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {286					error!("failed to disarm rollback run: {e}");287				}288			}289		} else if let Err(_e) = host290			.rm_file("/etc/fleet_rollback_marker", true)291			.in_current_span()292			.await293		{294			// Marker might not exist, yet better try to remove it.295		}296	}297	Ok(())298}299300async fn build_task(301	config: Config,302	hostname: String,303	build_attr: &str,304	batch: Option<NixBuildBatch>,305) -> Result<PathBuf> {306	info!("building");307	let host = config.host(&hostname).await?;308	// let action = Action::from(self.subcommand.clone());309	let nixos = host.nixos_config().await?;310	let drv = nix_go!(nixos.system.build[{ build_attr }]);311	let outputs = drv.build_maybe_batch(batch).await?;312	let out_output = outputs313		.get("out")314		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;315316	{317		info!("adding gc root");318		let mut cmd = config.local_host().cmd("nix").await?;319		cmd.arg("build")320			.comparg(321				"--profile",322				format!(323					"/nix/var/nix/profiles/{}-{hostname}",324					config.data().gc_root_prefix325				),326			)327			.arg(out_output);328		cmd.sudo().run_nix().await?;329	}330331	Ok(out_output.clone())332}333334impl BuildSystems {335	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {336		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;337		let set = LocalSet::new();338		let build_attr = self.build_attr.clone();339		let batch = (hosts.len() > 1).then(|| {340			config341				.nix_session342				.new_build_batch("build-hosts".to_string())343		});344		for host in 
hosts {345			let config = config.clone();346			let span = info_span!("build", host = field::display(&host.name));347			let hostname = host.name;348			let build_attr = build_attr.clone();349			let batch = batch.clone();350			set.spawn_local(351				(async move {352					let built = match build_task(config, hostname.clone(), &build_attr, batch).await353					{354						Ok(path) => path,355						Err(e) => {356							error!("failed to deploy host: {}", e);357							return;358						}359					};360					// TODO: Handle error361					let mut out = current_dir().expect("cwd exists");362					out.push(format!("built-{}", hostname));363364					info!("linking iso image to {:?}", out);365					if let Err(e) = symlink(built, out) {366						error!("failed to symlink: {e}")367					}368				})369				.instrument(span),370			);371		}372		drop(batch);373		set.await;374		Ok(())375	}376}377378impl Deploy {379	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {380		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;381		let set = LocalSet::new();382		let batch = (hosts.len() > 1).then(|| {383			config384				.nix_session385				.new_build_batch("deploy-hosts".to_string())386		});387		for host in hosts.into_iter() {388			let config = config.clone();389			let span = info_span!("deploy", host = field::display(&host.name));390			let hostname = host.name.clone();391			let local_host = config.local_host();392			let opts = opts.clone();393			let batch = batch.clone();394			if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, "deploy_kind").await? 
{395				host.set_deploy_kind(deploy_kind);396			};397398			set.spawn_local(399				(async move {400					let built =401						match build_task(config.clone(), hostname.clone(), "toplevel", batch).await402						{403							Ok(path) => path,404							Err(e) => {405								error!("failed to build host system closure: {}", e);406								return;407							}408						};409410					let deploy_kind = match host.deploy_kind().await {411						Ok(v) => v,412						Err(e) => {413							error!("failed to query target deploy kind: {e}");414							return;415						}416					};417418					// TODO: Make disable_rollback a host attribute instead419					let mut disable_rollback = self.disable_rollback;420					if !disable_rollback && deploy_kind != DeployKind::Fleet {421						warn!("disabling rollback, as not supported by non-fleet deployment kinds");422						disable_rollback = true;423					}424425					if !opts.is_local(&hostname) {426						info!("uploading system closure");427						{428							// TODO: Move to remote_derivation method.429							// Alternatively, nix store make-content-addressed can be used,430							// at least for the first deployment, to provide trusted store key.431							//432							// It is much slower, yet doesn't require root on the deployer machine.433							let Ok(mut sign) = local_host.cmd("nix").await else {434								error!("failed to setup local");435								return;436							};437							// Private key for host machine is registered in nix-sign.nix438							sign.arg("store")439								.arg("sign")440								.comparg("--key-file", "/etc/nix/private-key")441								.arg("-r")442								.arg(&built);443							if let Err(e) = sign.sudo().run_nix().await {444								warn!("failed to sign store paths: {e}");445							};446						}447						let mut tries = 0;448						loop {449							match host.remote_derivation(&built).await {450								Ok(remote) => {451									assert!(remote == built, "CA derivations aren't implemented");452									break;453								}454								Err(e) if 
tries < 3 => {455									tries += 1;456									warn!("copy failure ({}/3): {}", tries, e);457									sleep(Duration::from_millis(5000)).await;458								}459								Err(e) => {460									error!("upload failed: {e}");461									return;462								}463							}464						}465					}466					if let Err(e) = deploy_task(467						self.action,468						&host,469						built,470						if let Ok(v) = opts.action_attr(&host, "specialisation").await {471							v472						} else {473							error!("unreachable? failed to get specialization");474							return;475						},476						disable_rollback,477					)478					.await479					{480						error!("activation failed: {e}");481					}482				})483				.instrument(span),484			);485		}486		drop(batch);487		set.await;488		Ok(())489	}490}
modifiedcrates/fleet-base/Cargo.tomldiffbeforeafterboth
--- a/crates/fleet-base/Cargo.toml
+++ b/crates/fleet-base/Cargo.toml
@@ -13,6 +13,7 @@
 fleet-shared.workspace = true
 futures = "0.3.30"
 hostname = "0.4.0"
+indoc = "2.0.6"
 itertools = "0.13.0"
 nix-eval.workspace = true
 nixlike.workspace = true
modifiedcrates/fleet-base/src/host.rsdiffbeforeafterboth
--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -58,11 +58,35 @@
 	Su,
 }
 
+/// How a system closure is delivered to and installed on a target host.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DeployKind {
+	/// Plain NixOS host being converted to a fleet-managed one (NixOS => NixOS managed by fleet).
+	UpgradeToFleet,
+	/// Host already managed by fleet (NixOS managed by fleet => NixOS managed by fleet).
+	Fleet,
+	/// Remote host has /mnt, /mnt/boot mounted; generated config is added to fleet configuration.
+	NixosInstall,
+}
+
+impl FromStr for DeployKind {
+	type Err = anyhow::Error;
+	fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+		match s {
+			"upgrade-to-fleet" => Ok(Self::UpgradeToFleet),
+			"fleet" => Ok(Self::Fleet),
+			"nixos-install" => Ok(Self::NixosInstall),
+			v => bail!("unknown deploy_kind: {v}; expected one of \"upgrade-to-fleet\", \"fleet\", \"nixos-install\""),
+		}
+	}
+}
 pub struct ConfigHost {
 	config: Config,
 	pub name: String,
 	groups: OnceCell<Vec<String>>,
 
+	deploy_kind: OnceCell<DeployKind>,
+
 	pub host_config: Option<Value>,
 	pub nixos_config: OnceCell<Value>,
 	pub pkgs_override: Option<Value>,
@@ -73,6 +97,40 @@
 }
 // TODO: Move command helpers away with connectivity refactor
 impl ConfigHost {
+	pub fn set_deploy_kind(&self, kind: DeployKind) {
+		// The kind may be stored only once; a failed `set` is treated as a bug and panics.
+		if self.deploy_kind.set(kind).is_err() {
+			panic!("deploy kind is already set");
+		}
+	}
+	/// Returns the stored deploy kind, or `Fleet` when `file_exists("/etc/FLEET_HOST")` is true.
+	/// Errors when that check fails or when the host is not marked as managed by fleet.
+	pub async fn deploy_kind(&self) -> Result<DeployKind> {
+		if let Some(kind) = self.deploy_kind.get() {
+			return Ok(*kind);
+		}
+		let is_fleet_managed = match self.file_exists("/etc/FLEET_HOST").await {
+			Ok(v) => v,
+			Err(e) => bail!("failed to query remote system kind: {}", e),
+		};
+		if !is_fleet_managed {
+			bail!(indoc::indoc! {"
+				host is not marked as managed by fleet
+				if you're not trying to lustrate/install system from scratch,
+				you should either
+					1. manually create /etc/FLEET_HOST file on the target host,
+					2. use ?deploy_kind=fleet host argument if you're upgrading from older version of fleet
+					3. use ?deploy_kind=upgrade-to-fleet if you're upgrading from plain nixos to fleet-managed nixos
+			"});
+		}
+		// TOCTOU is possible: if a kind got stored meanwhile, `set` fails and that value is returned.
+		let _ = self.deploy_kind.set(DeployKind::Fleet);
+		Ok(self
+			.deploy_kind
+			.get()
+			.copied()
+			.expect("deploy kind is just set"))
+	}
 	pub async fn escalation_strategy(&self) -> Result<EscalationStrategy> {
 		// Prefer sudo, as run0 has some gotchas with polkit
 		// and too many repeating prompts.
@@ -189,6 +247,16 @@
 			Ok(MyCommand::new_on(escalation, cmd, session))
 		}
 	}
+	/// Creates a `nix` command for this host with the `nix-command` and `flakes`
+	/// experimental features passed explicitly on the command line.
+	pub async fn nix_cmd(&self) -> Result<MyCommand> {
+		let mut command = self.cmd("nix").await?;
+		// One `--extra-experimental-features` flag per feature, in a fixed order.
+		for feature in ["nix-command", "flakes"] {
+			command.args(["--extra-experimental-features", feature]);
+		}
+		Ok(command)
+	}
 
 	pub async fn decrypt(&self, data: SecretData) -> Result<Vec<u8>> {
 		ensure!(data.encrypted, "secret is not encrypted");
@@ -231,10 +299,23 @@
 			EscalationStrategy::Su,
 			"nix",
 		);
-		nix.arg("copy")
-			.arg("--substitute-on-destination")
-			.comparg("--to", format!("ssh-ng://{}", self.name))
-			.arg(path);
+		nix.arg("copy").arg("--substitute-on-destination");
+
+		match self.deploy_kind().await? {
+			DeployKind::Fleet | DeployKind::UpgradeToFleet => {
+				nix.comparg("--to", format!("ssh-ng://{}", self.name));
+			}
+			DeployKind::NixosInstall => {
+				nix
+					// Signature checking makes no sense with remote-store store argument set, as we're not even interacting with remote nix daemon
+					.arg("--no-check-sigs")
+					.comparg(
+						"--to",
+						format!("ssh-ng://root@{}-install?remote-store=/mnt", self.name),
+					);
+			}
+		}
+		nix.arg(path);
 		nix.run_nix().await.context("nix copy")?;
 		Ok(path.to_owned())
 	}
@@ -354,6 +435,7 @@
 
 			local: true,
 			session: OnceLock::new(),
+			deploy_kind: OnceCell::new(),
 		}
 	}
 
@@ -372,6 +454,7 @@
 			// TODO: Remove with connectivit refactor
 			local: self.localhost == name,
 			session: OnceLock::new(),
+			deploy_kind: OnceCell::new(),
 		})
 	}
 	pub async fn list_hosts(&self) -> Result<Vec<ConfigHost>> {
modifiedmodules/nixos/meta.nixdiffbeforeafterboth
--- a/modules/nixos/meta.nix
+++ b/modules/nixos/meta.nix
@@ -13,5 +13,13 @@
   ];
 
   # Version of environment (fleet scripts such as rollback) already installed on the host
-  config.environment.etc.FLEET_HOST.text = "1";
+  config = {
+    environment.etc.FLEET_HOST.text = "1";
+
+    # Flake/nix command support is assumed by fleet, let's add it here to avoid potential problems.
+    nix.settings.experimental-features = [
+      "nix-command"
+      "flakes"
+    ];
+  };
 }