git.delta.rocks / jrsonnet / refs/commits / faec7071817b

difftreelog

source

crates/fleet-base/src/deploy.rs10.3 KiBsourcehistory
1use std::{path::PathBuf, time::Duration};23use anyhow::{Context as _, Result, anyhow, bail};4use clap::ValueEnum;5use itertools::Itertools;6use tokio::time::sleep;7use tracing::{Instrument as _, error, info, info_span, warn};89use crate::host::{Config, ConfigHost, DeployKind, Generation, GenerationStorage};1011#[derive(ValueEnum, Clone, Copy)]12pub enum DeployAction {13	/// Upload derivation, but do not execute the update.14	Upload,15	/// Upload and execute the activation script, old version will be used after reboot.16	Test,17	/// Upload and set as current system profile, but do not execute activation script.18	Boot,19	/// Upload, set current profile, and execute activation script.20	Switch,21}2223impl DeployAction {24	pub(crate) fn name(&self) -> Option<&'static str> {25		match self {26			Self::Upload => None,27			Self::Test => Some("test"),28			Self::Boot => Some("boot"),29			Self::Switch => Some("switch"),30		}31	}32	pub(crate) fn should_switch_profile(&self) -> bool {33		matches!(self, Self::Switch | Self::Boot)34	}35	pub(crate) fn should_activate(&self) -> bool {36		matches!(self, Self::Switch | Self::Test | Self::Boot)37	}38	pub(crate) fn should_create_rollback_marker(&self) -> bool {39		// Upload does nothing on the target machine, other than uploading the closure.40		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.41		!matches!(self, Self::Upload)42	}43	pub(crate) fn should_schedule_rollback_run(&self) -> bool {44		matches!(self, Self::Switch | Self::Test)45	}46}4748async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {49	let generations = host.list_generations("system").await?;50	let current = generations51		.into_iter()52		.filter(|g| g.current)53		.at_most_one()54		.map_err(|_e| anyhow!("bad list-generations output"))?55		.ok_or_else(|| anyhow!("failed to find generation"))?;56	Ok(current)57}5859pub async fn deploy_task(60	action: DeployAction,61	host: &ConfigHost,62	built: PathBuf,63	specialisation: Option<String>,64	disable_rollback: bool,65) -> Result<()> {66	let deploy_kind = host.deploy_kind().await?;67	if (deploy_kind == DeployKind::NixosInstall || deploy_kind == DeployKind::NixosLustrate)68		&& !matches!(action, DeployAction::Boot | DeployAction::Upload)69	{70		bail!("{deploy_kind:?} deploy kind only supports boot and upload actions");71	}7273	let mut failed = false;7475	// TODO: Lockfile, to prevent concurrent system switch?76	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback77	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to78	// unit name conflict in systemd-run79	// This code is tied to rollback.nix80	if !disable_rollback && action.should_create_rollback_marker() {81		let _span = info_span!("preparing").entered();82		info!("preparing for rollback");83		let generation = get_current_generation(host).await?;84		info!(85			"rollback target would be {} {}",86			generation.id, generation.datetime87		);88		{89			let mut cmd = host.cmd("sh").await?;90			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));91			if let Err(e) = cmd.sudo().run().await {92				error!("failed to set rollback marker: {e}");93				failed = true;94			}95		}96		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.97		// Kicking it on manually will work best.98		//99		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will100		// only allow one instance of it.101102		// TODO: We should also watch how this process is going.103		// After running this command, we have less than 3 minutes to deploy everything,104		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.105		// Anyway, reboot will still help in this case.106		if action.should_schedule_rollback_run() {107			let mut cmd = host.cmd("systemd-run").await?;108			cmd.comparg("--on-active", "3min")109				.comparg("--unit", "rollback-watchdog-run")110				.arg("systemctl")111				.arg("start")112				.arg("rollback-watchdog.service");113			if let Err(e) = cmd.sudo().run().await {114				error!("failed to schedule rollback run: {e}");115				failed = true;116			}117		}118	}119	if deploy_kind == DeployKind::NixosLustrate {120		// Fleet could also create this file, but as this operation is potentially disruptive,121		// make user do it themself.122		if !host.file_exists("/etc/NIXOS_LUSTRATE").await? {123			bail!("/etc/NIXOS_LUSTRATE should be created on remote host");124		}125		// Wanted by NixOS to recognize the system as NixOS.126		let mut cmd = host.cmd("touch").await?;127		cmd.arg("/etc/NIXOS");128		cmd.sudo().run().await.context("creating /etc/NIXOS")?;129	}130	if deploy_kind == DeployKind::NixosInstall {131		info!(132			"running nixos-install to switch profile, install bootloader, and perform activation"133		);134		let mut cmd = host.cmd("nixos-install").await?;135		cmd.arg("--system").arg(&built).args([136			// Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.137			// It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.138			"--no-channel-copy",139			"--root",140			"/mnt",141		]);142		if let Err(e) = cmd.sudo().run().await {143			error!("failed to execute nixos-install: {e}");144			failed = true;145		}146	} else {147		if action.should_switch_profile() && !failed {148			info!("switching system profile generation");149150			// To avoid even more problems, using nixos-install for now.151			// // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,152			// // falling back to using nix-env command153			// // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.154			// let mut cmd = host.cmd("nix-env").await?;155			// cmd.args([156			// 	"--store",157			// 	"/mnt",158			// 	"--profile",159			// 	"/mnt/nix/var/nix/profiles/system",160			// 	"--set",161			// ])162			// .arg(&built);163			// if let Err(e) = cmd.sudo().run_nix().await {164			// 	error!("failed to switch system profile generation: {e}");165			// 	failed = true;166			// }167			// It would also be possible to update profile atomically during copy:168			// https://github.com/NixOS/nix/pull/11657169			let mut cmd = host.nix_cmd().await?;170			cmd.arg("build");171			cmd.comparg("--profile", "/nix/var/nix/profiles/system");172			cmd.arg(&built);173			if let Err(e) = cmd.sudo().run_nix().await {174				error!("failed to switch system profile generation: {e}");175				failed = true;176			}177		}178179		// FIXME: Connection might be disconnected after activation run180181		if action.should_activate() && !failed {182			let _span = info_span!("activating").entered();183			info!("executing activation script");184			let specialised = if let Some(specialisation) = specialisation {185				let mut specialised = built.join("specialisation");186				specialised.push(specialisation);187				specialised188			} else {189				built.clone()190			};191			let switch_script = specialised.join("bin/switch-to-configuration");192			let mut cmd = host.cmd("systemd-run").in_current_span().await?;193			cmd.arg("--collect")194				.arg("--no-ask-password")195				.arg("--pipe")196				.arg("--quiet")197				.arg("--service-type=exec")198				.arg("--unit=fleet-switch-to-configuration")199				.arg(switch_script);200			if deploy_kind == DeployKind::NixosLustrate {201				cmd.env("NIXOS_INSTALL_BOOTLOADER", "1");202			}203			cmd.env("FLEET_ONLINE_ACTIVATION", "1")204				.arg(action.name().expect("upload.should_activate == false"));205			if let Err(e) = cmd.sudo().run().in_current_span().await {206				error!("failed to activate: {e}");207				failed = true;208			}209		}210	}211	if action.should_create_rollback_marker() {212		if !disable_rollback {213			if failed {214				if action.should_schedule_rollback_run() {215					info!("executing rollback");216					if let Err(e) = host217						.systemctl_start("rollback-watchdog.service")218						.instrument(info_span!("rollback"))219						.await220					{221						error!("failed to trigger rollback: {e}")222					}223				}224			} else {225				info!("trying to mark upgrade as successful");226				if let Err(e) = host227					.rm_file("/etc/fleet_rollback_marker", true)228					.in_current_span()229					.await230				{231					error!(232						"failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}"233					)234				}235			}236			info!("disarming watchdog, just in case");237			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {238				// It is ok, if there was no reboot - then timer might not be running.239			}240			if action.should_schedule_rollback_run() {241				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {242					error!("failed to disarm rollback run: {e}");243				}244			}245		} else if let Err(_e) = host246			.rm_file("/etc/fleet_rollback_marker", true)247			.in_current_span()248			.await249		{250			// Marker might not exist, yet better try to remove it.251		}252	}253	Ok(())254}255256pub async fn upload_task(257	config: &Config,258	host: &ConfigHost,259	location: GenerationStorage,260	generation: PathBuf,261) -> Result<PathBuf> {262	let local_host = config.local_host();263	if matches!(location, GenerationStorage::Pusher) {264		bail!("pusher is not enabled in this version of fleet");265	}266	if !host.local {267		info!("uploading system closure");268		{269			// TODO: Move to remote_derivation method.270			// Alternatively, nix store make-content-addressed can be used,271			// at least for the first deployment, to provide trusted store key.272			//273			// It is much slower, yet doesn't require root on the deployer machine.274			let Ok(mut sign) = local_host.cmd("nix").await else {275				bail!("failed to setup local");276			};277			// Private key for host machine is registered in nix-sign.nix278			sign.arg("store")279				.arg("sign")280				.comparg("--key-file", "/etc/nix/private-key")281				.arg("-r")282				.arg(&generation);283			if let Err(e) = sign.sudo().run_nix().await {284				warn!("failed to sign store paths: {e}");285			};286		}287		let mut tries = 0;288		loop {289			match host.remote_derivation(&generation).await {290				Ok(remote) => {291					assert!(remote == generation, "CA derivations aren't implemented");292					return Ok(remote);293				}294				Err(e) if tries < 3 => {295					tries += 1;296					warn!("copy failure ({}/3): {}", tries, e);297					sleep(Duration::from_millis(5000)).await;298				}299				Err(e) => {300					bail!("upload failed: {e}");301				}302			}303		}304	}305	Ok(generation)306}