git.delta.rocks / jrsonnet / refs/commits / 33e3a6cc33fd

diff · tree · log

feat basic lustration helper

Lach · 2025-04-24 · parent: #bd11592 · .patch · .diff
in: trunk

3 files changed

modified · cmds/fleet/src/cmds/build_systems.rs · diff · before · after · both
before · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost, DeployKind},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75	let mut parts = g.split_whitespace();76	let id = parts.next()?;77	let id: u32 = id.parse().ok()?;78	let date = parts.next()?;79	let time = parts.next()?;80	let current = if let Some(current) = parts.next() {81		if current == "(current)" {82			Some(true)83		} else {84			None85		}86	} else {87		Some(false)88	};89	let current = current?;90	if parts.next().is_some() {91		warn!("unexpected text after generation: {g}");92	}93	Some(Generation {94		id,95		current,96		datetime: format!("{date} {time}"),97	})98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101	let mut cmd = host.cmd("nix-env").await?;102	cmd.comparg("--profile", "/nix/var/nix/profiles/system")103		.arg("--list-generations");104	// Sudo is required due to --list-generations acquiring lock on the profile.105	let data = cmd.sudo().run_string().await?;106	let generations = data107		.split('\n')108		.map(|e| e.trim())109		.filter(|&l| !l.is_empty())110		.filter_map(|g| {111			let gen = parse_generation_line(g);112			if gen.is_none() {113				warn!("bad generation: {g}");114			}115			gen116		})117		.collect::<Vec<_>>();118	let current = generations119		.into_iter()120		.filter(|g| g.current)121		.at_most_one()122		.map_err(|_e| anyhow!("bad list-generations output"))?123		.ok_or_else(|| anyhow!("failed to find generation"))?;124	Ok(current)125}126127async fn deploy_task(128	action: DeployAction,129	host: &ConfigHost,130	built: PathBuf,131	specialisation: Option<String>,132	disable_rollback: bool,133) -> Result<()> {134	let deploy_kind = host.deploy_kind().await?;135	if deploy_kind == DeployKind::NixosInstall136		&& 
!matches!(action, DeployAction::Boot | DeployAction::Upload)137	{138		bail!("nixos-install deploy kind only supports boot and upload actions");139	}140141	let mut failed = false;142143	// TODO: Lockfile, to prevent concurrent system switch?144	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146	// unit name conflict in systemd-run147	// This code is tied to rollback.nix148	if !disable_rollback && action.should_create_rollback_marker() {149		let _span = info_span!("preparing").entered();150		info!("preparing for rollback");151		let generation = get_current_generation(host).await?;152		info!(153			"rollback target would be {} {}",154			generation.id, generation.datetime155		);156		{157			let mut cmd = host.cmd("sh").await?;158			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159			if let Err(e) = cmd.sudo().run().await {160				error!("failed to set rollback marker: {e}");161				failed = true;162			}163		}164		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165		// Kicking it on manually will work best.166		//167		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168		// only allow one instance of it.169170		// TODO: We should also watch how this process is going.171		// After running this command, we have less than 3 minutes to deploy everything,172		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.173		// Anyway, reboot will still help in this case.174		if action.should_schedule_rollback_run() {175			let mut cmd = host.cmd("systemd-run").await?;176			cmd.comparg("--on-active", "3min")177				
.comparg("--unit", "rollback-watchdog-run")178				.arg("systemctl")179				.arg("start")180				.arg("rollback-watchdog.service");181			if let Err(e) = cmd.sudo().run().await {182				error!("failed to schedule rollback run: {e}");183				failed = true;184			}185		}186	}187	if deploy_kind == DeployKind::NixosInstall {188		info!(189			"running nixos-install to switch profile, install bootloader, and perform activation"190		);191		let mut cmd = host.cmd("nixos-install").await?;192		cmd.arg("--system").arg(&built).args([193			// Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.194			// It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.195			"--no-channel-copy",196			"--root",197			"/mnt",198		]);199		if let Err(e) = cmd.sudo().run().await {200			error!("failed to execute nixos-install: {e}");201			failed = true;202		}203	} else {204		if action.should_switch_profile() && !failed {205			info!("switching system profile generation");206207			// To avoid even more problems, using nixos-install for now.208			// // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,209			// // falling back to using nix-env command210			// // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.211			// let mut cmd = host.cmd("nix-env").await?;212			// cmd.args([213			// 	"--store",214			// 	"/mnt",215			// 	"--profile",216			// 	"/mnt/nix/var/nix/profiles/system",217			// 	"--set",218			// ])219			// .arg(&built);220			// if let Err(e) = cmd.sudo().run_nix().await {221			// 	error!("failed to switch system profile generation: {e}");222			// 	failed = true;223			// }224			// It would also be possible to update profile atomically during copy:225			// https://github.com/NixOS/nix/pull/11657226			let mut cmd = host.nix_cmd().await?;227			
cmd.arg("build");228			cmd.comparg("--profile", "/nix/var/nix/profiles/system");229			cmd.arg(&built);230			if let Err(e) = cmd.sudo().run_nix().await {231				error!("failed to switch system profile generation: {e}");232				failed = true;233			}234		}235236		// FIXME: Connection might be disconnected after activation run237238		if action.should_activate() && !failed {239			let _span = info_span!("activating").entered();240			info!("executing activation script");241			let specialised = if let Some(specialisation) = specialisation {242				let mut specialised = built.join("specialisation");243				specialised.push(specialisation);244				specialised245			} else {246				built.clone()247			};248			let switch_script = specialised.join("bin/switch-to-configuration");249			let mut cmd = host.cmd(switch_script).in_current_span().await?;250			cmd.env("FLEET_ONLINE_ACTIVATION", "1")251				.arg(action.name().expect("upload.should_activate == false"));252			if let Err(e) = cmd.sudo().run().in_current_span().await {253				error!("failed to activate: {e}");254				failed = true;255			}256		}257	}258	if action.should_create_rollback_marker() {259		if !disable_rollback {260			if failed {261				if action.should_schedule_rollback_run() {262					info!("executing rollback");263					if let Err(e) = host264						.systemctl_start("rollback-watchdog.service")265						.instrument(info_span!("rollback"))266						.await267					{268						error!("failed to trigger rollback: {e}")269					}270				}271			} else {272				info!("trying to mark upgrade as successful");273				if let Err(e) = host274					.rm_file("/etc/fleet_rollback_marker", true)275					.in_current_span()276					.await277				{278					error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")279				}280			}281			info!("disarming watchdog, just in case");282			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {283				// It is ok, if there was no reboot - then timer might not be running.284			}285			if action.should_schedule_rollback_run() {286				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {287					error!("failed to disarm rollback run: {e}");288				}289			}290		} else if let Err(_e) = host291			.rm_file("/etc/fleet_rollback_marker", true)292			.in_current_span()293			.await294		{295			// Marker might not exist, yet better try to remove it.296		}297	}298	Ok(())299}300301async fn build_task(302	config: Config,303	hostname: String,304	build_attr: &str,305	batch: Option<NixBuildBatch>,306) -> Result<PathBuf> {307	info!("building");308	let host = config.host(&hostname).await?;309	// let action = Action::from(self.subcommand.clone());310	let nixos = host.nixos_config().await?;311	let drv = nix_go!(nixos.system.build[{ build_attr }]);312	let outputs = drv.build_maybe_batch(batch).await?;313	let out_output = outputs314		.get("out")315		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;316317	{318		info!("adding gc root");319		let mut cmd = config.local_host().cmd("nix").await?;320		cmd.arg("build")321			.comparg(322				"--profile",323				format!(324					"/nix/var/nix/profiles/{}-{hostname}",325					config.data().gc_root_prefix326				),327			)328			.arg(out_output);329		cmd.sudo().run_nix().await?;330	}331332	Ok(out_output.clone())333}334335impl BuildSystems {336	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {337		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;338		let set = LocalSet::new();339		let build_attr = self.build_attr.clone();340		let batch = (hosts.len() > 1).then(|| {341			config342				.nix_session343				.new_build_batch("build-hosts".to_string())344		});345		for host in 
hosts {346			let config = config.clone();347			let span = info_span!("build", host = field::display(&host.name));348			let hostname = host.name;349			let build_attr = build_attr.clone();350			let batch = batch.clone();351			set.spawn_local(352				(async move {353					let built = match build_task(config, hostname.clone(), &build_attr, batch).await354					{355						Ok(path) => path,356						Err(e) => {357							error!("failed to deploy host: {}", e);358							return;359						}360					};361					// TODO: Handle error362					let mut out = current_dir().expect("cwd exists");363					out.push(format!("built-{}", hostname));364365					info!("linking iso image to {:?}", out);366					if let Err(e) = symlink(built, out) {367						error!("failed to symlink: {e}")368					}369				})370				.instrument(span),371			);372		}373		drop(batch);374		set.await;375		Ok(())376	}377}378379impl Deploy {380	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {381		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;382		let set = LocalSet::new();383		let batch = (hosts.len() > 1).then(|| {384			config385				.nix_session386				.new_build_batch("deploy-hosts".to_string())387		});388		for host in hosts.into_iter() {389			let config = config.clone();390			let span = info_span!("deploy", host = field::display(&host.name));391			let hostname = host.name.clone();392			let local_host = config.local_host();393			let opts = opts.clone();394			let batch = batch.clone();395			if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, "deploy_kind").await? 
{396				host.set_deploy_kind(deploy_kind);397			};398399			set.spawn_local(400				(async move {401					let built =402						match build_task(config.clone(), hostname.clone(), "toplevel", batch).await403						{404							Ok(path) => path,405							Err(e) => {406								error!("failed to build host system closure: {}", e);407								return;408							}409						};410411					let deploy_kind = match host.deploy_kind().await {412						Ok(v) => v,413						Err(e) => {414							error!("failed to query target deploy kind: {e}");415							return;416						}417					};418419					// TODO: Make disable_rollback a host attribute instead420					let mut disable_rollback = self.disable_rollback;421					if !disable_rollback && deploy_kind != DeployKind::Fleet {422						warn!("disabling rollback, as not supported by non-fleet deployment kinds");423						disable_rollback = true;424					}425426					if !opts.is_local(&hostname) {427						info!("uploading system closure");428						{429							// TODO: Move to remote_derivation method.430							// Alternatively, nix store make-content-addressed can be used,431							// at least for the first deployment, to provide trusted store key.432							//433							// It is much slower, yet doesn't require root on the deployer machine.434							let Ok(mut sign) = local_host.cmd("nix").await else {435								error!("failed to setup local");436								return;437							};438							// Private key for host machine is registered in nix-sign.nix439							sign.arg("store")440								.arg("sign")441								.comparg("--key-file", "/etc/nix/private-key")442								.arg("-r")443								.arg(&built);444							if let Err(e) = sign.sudo().run_nix().await {445								warn!("failed to sign store paths: {e}");446							};447						}448						let mut tries = 0;449						loop {450							match host.remote_derivation(&built).await {451								Ok(remote) => {452									assert!(remote == built, "CA derivations aren't implemented");453									break;454								}455								Err(e) if 
tries < 3 => {456									tries += 1;457									warn!("copy failure ({}/3): {}", tries, e);458									sleep(Duration::from_millis(5000)).await;459								}460								Err(e) => {461									error!("upload failed: {e}");462									return;463								}464							}465						}466					}467					if let Err(e) = deploy_task(468						self.action,469						&host,470						built,471						if let Ok(v) = opts.action_attr(&host, "specialisation").await {472							v473						} else {474							error!("unreachable? failed to get specialization");475							return;476						},477						disable_rollback,478					)479					.await480					{481						error!("activation failed: {e}");482					}483				})484				.instrument(span),485			);486		}487		drop(batch);488		set.await;489		Ok(())490	}491}
after · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Context, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost, DeployKind},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75	let mut parts = g.split_whitespace();76	let id = parts.next()?;77	let id: u32 = id.parse().ok()?;78	let date = parts.next()?;79	let time = parts.next()?;80	let current = if let Some(current) = parts.next() {81		if current == "(current)" {82			Some(true)83		} else {84			None85		}86	} else {87		Some(false)88	};89	let current = current?;90	if parts.next().is_some() {91		warn!("unexpected text after generation: {g}");92	}93	Some(Generation {94		id,95		current,96		datetime: format!("{date} {time}"),97	})98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101	let mut cmd = host.cmd("nix-env").await?;102	cmd.comparg("--profile", "/nix/var/nix/profiles/system")103		.arg("--list-generations");104	// Sudo is required due to --list-generations acquiring lock on the profile.105	let data = cmd.sudo().run_string().await?;106	let generations = data107		.split('\n')108		.map(|e| e.trim())109		.filter(|&l| !l.is_empty())110		.filter_map(|g| {111			let gen = parse_generation_line(g);112			if gen.is_none() {113				warn!("bad generation: {g}");114			}115			gen116		})117		.collect::<Vec<_>>();118	let current = generations119		.into_iter()120		.filter(|g| g.current)121		.at_most_one()122		.map_err(|_e| anyhow!("bad list-generations output"))?123		.ok_or_else(|| anyhow!("failed to find generation"))?;124	Ok(current)125}126127async fn deploy_task(128	action: DeployAction,129	host: &ConfigHost,130	built: PathBuf,131	specialisation: Option<String>,132	disable_rollback: bool,133) -> Result<()> {134	let deploy_kind = host.deploy_kind().await?;135	if (deploy_kind == DeployKind::NixosInstall || 
deploy_kind == DeployKind::NixosLustrate)136		&& !matches!(action, DeployAction::Boot | DeployAction::Upload)137	{138		bail!("{deploy_kind:?} deploy kind only supports boot and upload actions");139	}140141	let mut failed = false;142143	// TODO: Lockfile, to prevent concurrent system switch?144	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146	// unit name conflict in systemd-run147	// This code is tied to rollback.nix148	if !disable_rollback && action.should_create_rollback_marker() {149		let _span = info_span!("preparing").entered();150		info!("preparing for rollback");151		let generation = get_current_generation(host).await?;152		info!(153			"rollback target would be {} {}",154			generation.id, generation.datetime155		);156		{157			let mut cmd = host.cmd("sh").await?;158			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159			if let Err(e) = cmd.sudo().run().await {160				error!("failed to set rollback marker: {e}");161				failed = true;162			}163		}164		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165		// Kicking it on manually will work best.166		//167		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168		// only allow one instance of it.169170		// TODO: We should also watch how this process is going.171		// After running this command, we have less than 3 minutes to deploy everything,172		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.173		// Anyway, reboot will still help in this case.174		if action.should_schedule_rollback_run() {175			let mut cmd = 
host.cmd("systemd-run").await?;176			cmd.comparg("--on-active", "3min")177				.comparg("--unit", "rollback-watchdog-run")178				.arg("systemctl")179				.arg("start")180				.arg("rollback-watchdog.service");181			if let Err(e) = cmd.sudo().run().await {182				error!("failed to schedule rollback run: {e}");183				failed = true;184			}185		}186	}187	if deploy_kind == DeployKind::NixosLustrate {188		// Fleet could also create this file, but as this operation is potentially disruptive,189		// make user do it themself.190		if !host.file_exists("/etc/NIXOS_LUSTRATE").await? {191			bail!("/etc/NIXOS_LUSTRATE should be created on remote host");192		}193		// Wanted by NixOS to recognize the system as NixOS.194		let mut cmd = host.cmd("touch").await?;195		cmd.arg("/etc/NIXOS");196		cmd.sudo().run().await.context("creating /etc/NIXOS")?;197	}198	if deploy_kind == DeployKind::NixosInstall {199		info!(200			"running nixos-install to switch profile, install bootloader, and perform activation"201		);202		let mut cmd = host.cmd("nixos-install").await?;203		cmd.arg("--system").arg(&built).args([204			// Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.205			// It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.206			"--no-channel-copy",207			"--root",208			"/mnt",209		]);210		if let Err(e) = cmd.sudo().run().await {211			error!("failed to execute nixos-install: {e}");212			failed = true;213		}214	} else {215		if action.should_switch_profile() && !failed {216			info!("switching system profile generation");217218			// To avoid even more problems, using nixos-install for now.219			// // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,220			// // falling back to using nix-env command221			// // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead 
of nix build below.222			// let mut cmd = host.cmd("nix-env").await?;223			// cmd.args([224			// 	"--store",225			// 	"/mnt",226			// 	"--profile",227			// 	"/mnt/nix/var/nix/profiles/system",228			// 	"--set",229			// ])230			// .arg(&built);231			// if let Err(e) = cmd.sudo().run_nix().await {232			// 	error!("failed to switch system profile generation: {e}");233			// 	failed = true;234			// }235			// It would also be possible to update profile atomically during copy:236			// https://github.com/NixOS/nix/pull/11657237			let mut cmd = host.nix_cmd().await?;238			cmd.arg("build");239			cmd.comparg("--profile", "/nix/var/nix/profiles/system");240			cmd.arg(&built);241			if let Err(e) = cmd.sudo().run_nix().await {242				error!("failed to switch system profile generation: {e}");243				failed = true;244			}245		}246247		// FIXME: Connection might be disconnected after activation run248249		if action.should_activate() && !failed {250			let _span = info_span!("activating").entered();251			info!("executing activation script");252			let specialised = if let Some(specialisation) = specialisation {253				let mut specialised = built.join("specialisation");254				specialised.push(specialisation);255				specialised256			} else {257				built.clone()258			};259			let switch_script = specialised.join("bin/switch-to-configuration");260			let mut cmd = host.cmd(switch_script).in_current_span().await?;261			if deploy_kind == DeployKind::NixosLustrate {262				cmd.env("NIXOS_INSTALL_BOOTLOADER", "1");263			}264			cmd.env("FLEET_ONLINE_ACTIVATION", "1")265				.arg(action.name().expect("upload.should_activate == false"));266			if let Err(e) = cmd.sudo().run().in_current_span().await {267				error!("failed to activate: {e}");268				failed = true;269			}270		}271	}272	if action.should_create_rollback_marker() {273		if !disable_rollback {274			if failed {275				if action.should_schedule_rollback_run() {276					info!("executing rollback");277					if let Err(e) = host278						
.systemctl_start("rollback-watchdog.service")279						.instrument(info_span!("rollback"))280						.await281					{282						error!("failed to trigger rollback: {e}")283					}284				}285			} else {286				info!("trying to mark upgrade as successful");287				if let Err(e) = host288					.rm_file("/etc/fleet_rollback_marker", true)289					.in_current_span()290					.await291				{292					error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")293				}294			}295			info!("disarming watchdog, just in case");296			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {297				// It is ok, if there was no reboot - then timer might not be running.298			}299			if action.should_schedule_rollback_run() {300				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {301					error!("failed to disarm rollback run: {e}");302				}303			}304		} else if let Err(_e) = host305			.rm_file("/etc/fleet_rollback_marker", true)306			.in_current_span()307			.await308		{309			// Marker might not exist, yet better try to remove it.310		}311	}312	Ok(())313}314315async fn build_task(316	config: Config,317	hostname: String,318	build_attr: &str,319	batch: Option<NixBuildBatch>,320) -> Result<PathBuf> {321	info!("building");322	let host = config.host(&hostname).await?;323	// let action = Action::from(self.subcommand.clone());324	let nixos = host.nixos_config().await?;325	let drv = nix_go!(nixos.system.build[{ build_attr }]);326	let outputs = drv.build_maybe_batch(batch).await?;327	let out_output = outputs328		.get("out")329		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;330331	{332		info!("adding gc root");333		let mut cmd = config.local_host().cmd("nix").await?;334		cmd.arg("build")335			.comparg(336				"--profile",337				format!(338					"/nix/var/nix/profiles/{}-{hostname}",339					config.data().gc_root_prefix340				),341			)342			.arg(out_output);343		cmd.sudo().run_nix().await?;344	}345346	
Ok(out_output.clone())347}348349impl BuildSystems {350	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {351		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;352		let set = LocalSet::new();353		let build_attr = self.build_attr.clone();354		let batch = (hosts.len() > 1).then(|| {355			config356				.nix_session357				.new_build_batch("build-hosts".to_string())358		});359		for host in hosts {360			let config = config.clone();361			let span = info_span!("build", host = field::display(&host.name));362			let hostname = host.name;363			let build_attr = build_attr.clone();364			let batch = batch.clone();365			set.spawn_local(366				(async move {367					let built = match build_task(config, hostname.clone(), &build_attr, batch).await368					{369						Ok(path) => path,370						Err(e) => {371							error!("failed to deploy host: {}", e);372							return;373						}374					};375					// TODO: Handle error376					let mut out = current_dir().expect("cwd exists");377					out.push(format!("built-{}", hostname));378379					info!("linking iso image to {:?}", out);380					if let Err(e) = symlink(built, out) {381						error!("failed to symlink: {e}")382					}383				})384				.instrument(span),385			);386		}387		drop(batch);388		set.await;389		Ok(())390	}391}392393impl Deploy {394	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {395		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;396		let set = LocalSet::new();397		let batch = (hosts.len() > 1).then(|| {398			config399				.nix_session400				.new_build_batch("deploy-hosts".to_string())401		});402		for host in hosts.into_iter() {403			let config = config.clone();404			let span = info_span!("deploy", host = field::display(&host.name));405			let hostname = host.name.clone();406			let local_host = config.local_host();407			let opts = opts.clone();408			let batch = batch.clone();409			if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, 
"deploy_kind").await? {410				host.set_deploy_kind(deploy_kind);411			};412413			set.spawn_local(414				(async move {415					let built =416						match build_task(config.clone(), hostname.clone(), "toplevel", batch).await417						{418							Ok(path) => path,419							Err(e) => {420								error!("failed to build host system closure: {}", e);421								return;422							}423						};424425					let deploy_kind = match host.deploy_kind().await {426						Ok(v) => v,427						Err(e) => {428							error!("failed to query target deploy kind: {e}");429							return;430						}431					};432433					// TODO: Make disable_rollback a host attribute instead434					let mut disable_rollback = self.disable_rollback;435					if !disable_rollback && deploy_kind != DeployKind::Fleet {436						warn!("disabling rollback, as not supported by non-fleet deployment kinds");437						disable_rollback = true;438					}439440					if !opts.is_local(&hostname) {441						info!("uploading system closure");442						{443							// TODO: Move to remote_derivation method.444							// Alternatively, nix store make-content-addressed can be used,445							// at least for the first deployment, to provide trusted store key.446							//447							// It is much slower, yet doesn't require root on the deployer machine.448							let Ok(mut sign) = local_host.cmd("nix").await else {449								error!("failed to setup local");450								return;451							};452							// Private key for host machine is registered in nix-sign.nix453							sign.arg("store")454								.arg("sign")455								.comparg("--key-file", "/etc/nix/private-key")456								.arg("-r")457								.arg(&built);458							if let Err(e) = sign.sudo().run_nix().await {459								warn!("failed to sign store paths: {e}");460							};461						}462						let mut tries = 0;463						loop {464							match host.remote_derivation(&built).await {465								Ok(remote) => {466									assert!(remote == built, "CA derivations aren't implemented");467									break;468								
}469								Err(e) if tries < 3 => {470									tries += 1;471									warn!("copy failure ({}/3): {}", tries, e);472									sleep(Duration::from_millis(5000)).await;473								}474								Err(e) => {475									error!("upload failed: {e}");476									return;477								}478							}479						}480					}481					if let Err(e) = deploy_task(482						self.action,483						&host,484						built,485						if let Ok(v) = opts.action_attr(&host, "specialisation").await {486							v487						} else {488							error!("unreachable? failed to get specialization");489							return;490						},491						disable_rollback,492					)493					.await494					{495						error!("activation failed: {e}");496					}497				})498				.instrument(span),499			);500		}501		drop(batch);502		set.await;503		Ok(())504	}505}
modified · crates/fleet-base/src/host.rs · diff · before · after · both
--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -23,8 +23,10 @@
 };
 
 pub struct FleetConfigInternals {
+	/// Fleet project directory, containing fleet.nix file.
+	pub directory: PathBuf,
+	/// builtins.currentSystem
 	pub local_system: String,
-	pub directory: PathBuf,
 	pub data: Mutex<FleetData>,
 	pub nix_args: Vec<OsString>,
 	/// fleet_config.config
@@ -34,6 +36,7 @@
 
 	/// import nixpkgs {system = local};
 	pub default_pkgs: Value,
+	/// inputs.nixpkgs
 	pub nixpkgs: Value,
 
 	pub nix_session: NixSession,
@@ -58,7 +61,7 @@
 	Su,
 }
 
-#[derive(Clone, PartialEq, Copy)]
+#[derive(Clone, PartialEq, Copy, Debug)]
 pub enum DeployKind {
 	/// NixOS => NixOS managed by fleet
 	UpgradeToFleet,
@@ -67,6 +70,10 @@
 	/// Remote host has /mnt, /mnt/boot mounted,
 	/// generated config is added to fleet configuration.
 	NixosInstall,
+	/// Remote host has some system and nix installed in multi-user mode (/nix is owned by root),
+	/// generated config is added to fleet configuration,
+	/// and /etc/NIXOS_LUSTRATE exists, fleet will perform the rest.
+	NixosLustrate,
 }
 
 impl FromStr for DeployKind {
@@ -302,7 +309,7 @@
 		nix.arg("copy").arg("--substitute-on-destination");
 
 		match self.deploy_kind().await? {
-			DeployKind::Fleet | DeployKind::UpgradeToFleet => {
+			DeployKind::Fleet | DeployKind::UpgradeToFleet | DeployKind::NixosLustrate => {
 				nix.comparg("--to", format!("ssh-ng://{}", self.name));
 			}
 			DeployKind::NixosInstall => {
modified crates/fleet-base/src/opts.rs (diff | before | after | both)
--- a/crates/fleet-base/src/opts.rs
+++ b/crates/fleet-base/src/opts.rs
@@ -6,7 +6,7 @@
 	sync::{Arc, Mutex},
 };
 
-use anyhow::{Context, Result};
+use anyhow::{bail, Context, Result};
 use clap::Parser;
 use nix_eval::{nix_go, util::assert_warn, NixSessionPool, Value};
 use nom::{
@@ -182,7 +182,23 @@
 
 	// TODO: Config should be detached from opts.
 	pub async fn build(&self, nix_args: Vec<OsString>, assert: bool) -> Result<Config> {
-		let directory = current_dir()?;
+		let cwd = current_dir()?;
+		let mut directory = cwd.clone();
+		let mut fleet_data_path = directory.join("fleet.nix");
+		while !fleet_data_path.is_file() {
+			// fleet.nix
+			fleet_data_path.pop();
+			if !directory.pop() || !fleet_data_path.pop() {
+				bail!(
+					"fleet.nix not found at {} or any of the parent directories",
+					cwd.display()
+				);
+			}
+			fleet_data_path.push("fleet.nix");
+		}
+		let bytes =
+			std::fs::read_to_string(&fleet_data_path).context("reading fleet state (fleet.nix)")?;
+		let data: Mutex<FleetData> = nixlike::parse_str(&bytes)?;
 
 		let pool = NixSessionPool::new(
 			directory.as_os_str().to_owned(),
@@ -193,12 +209,6 @@
 		let nix_session = pool.get().await?;
 
 		let builtins_field = Value::binding(nix_session.clone(), "builtins").await?;
-
-		let mut fleet_data_path = directory.clone();
-		fleet_data_path.push("fleet.nix");
-		let bytes =
-			std::fs::read_to_string(fleet_data_path).context("reading fleet state (fleet.nix)")?;
-		let data: Mutex<FleetData> = nixlike::parse_str(&bytes)?;
 
 		let fleet_root = Value::binding(nix_session.clone(), "fleetConfigurations").await?;
 		let fleet_field = nix_go!(fleet_root.default({ data }));