git.delta.rocks / jrsonnet / refs/commits / dfbdb4ac5bb1

difftreelog

refactor use nix build to update profile generation

Yaroslav Bolyukin2024-11-14parent: #15fd410.patch.diff
in: trunk

1 file changed

modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
after · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::nix_go;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}73async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {74	let mut cmd = host.cmd("nix-env").await?;75	cmd.comparg("--profile", "/nix/var/nix/profiles/system")76		.arg("--list-generations");77	// Sudo is required due to --list-generations acquiring lock on the profile.78	let data = cmd.sudo().run_string().await?;79	let generations = data80		.split('\n')81		.map(|e| e.trim())82		.filter(|&l| !l.is_empty())83		.filter_map(|g| {84			let gen: Option<Generation> = try {85				let mut parts = g.split_whitespace();86				let id = parts.next()?;87				let id: u32 = id.parse().ok()?;88				let date = parts.next()?;89				let time = parts.next()?;90				let current = if let Some(current) = parts.next() {91					if current == "(current)" {92						Some(true)93					} else {94						None95					}96				} else {97					Some(false)98				};99				let current = current?;100				if parts.next().is_some() {101					warn!("unexpected text after generation: {g}");102				}103				Generation {104					id,105					current,106					datetime: format!("{date} {time}"),107				}108			};109			if gen.is_none() {110				warn!("bad generation: {g}")111			}112			gen113		})114		.collect::<Vec<_>>();115	let current = generations116		.into_iter()117		.filter(|g| g.current)118		.at_most_one()119		.map_err(|_e| anyhow!("bad list-generations output"))?120		.ok_or_else(|| anyhow!("failed to find generation"))?;121	Ok(current)122}123124async fn deploy_task(125	action: DeployAction,126	host: &ConfigHost,127	built: PathBuf,128	specialisation: Option<String>,129	disable_rollback: bool,130) -> Result<()> {131	let mut failed = false;132	// TODO: Lockfile, to prevent concurrent system switch?133	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback134	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to135	// unit name conflict in systemd-run136	// This code is tied to rollback.nix137	if !disable_rollback && action.should_create_rollback_marker() {138		let _span = info_span!("preparing").entered();139		info!("preparing for rollback");140		let generation = get_current_generation(host).await?;141		info!(142			"rollback target would be {} {}",143			generation.id, generation.datetime144		);145		{146			let mut cmd = host.cmd("sh").await?;147			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));148			if let Err(e) = cmd.sudo().run().await {149				error!("failed to set rollback marker: {e}");150				failed = true;151			}152		}153		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.154		// Kicking it on manually will work best.155		//156		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will157		// only allow one instance of it.158159		// TODO: We should also watch how this process is going.160		// After running this command, we have less than 3 minutes to deploy everything,161		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.162		// Anyway, reboot will still help in this case.163		if action.should_schedule_rollback_run() {164			let mut cmd = host.cmd("systemd-run").await?;165			cmd.comparg("--on-active", "3min")166				.comparg("--unit", "rollback-watchdog-run")167				.arg("systemctl")168				.arg("start")169				.arg("rollback-watchdog.service");170			if let Err(e) = cmd.sudo().run().await {171				error!("failed to schedule rollback run: {e}");172				failed = true;173			}174		}175	}176177	if action.should_switch_profile() && !failed {178		info!("switching system profile generation");179		// It would also be possible to update profile atomically during copy:180		// https://github.com/NixOS/nix/pull/11657181		let mut cmd = host.cmd("nix").await?;182		cmd.arg("build");183		cmd.comparg("--profile", "/nix/var/nix/profiles/system");184		cmd.arg(&built);185		if let Err(e) = cmd.sudo().run_nix().await {186			error!("failed to switch system profile generation: {e}");187			failed = true;188		}189	}190191	// FIXME: Connection might be disconnected after activation run192193	if action.should_activate() && !failed {194		let _span = info_span!("activating").entered();195		info!("executing activation script");196		let specialised = if let Some(specialisation) = specialisation {197			let mut specialised = built.join("specialisation");198			specialised.push(specialisation);199			specialised200		} else {201			built.clone()202		};203		let switch_script = specialised.join("bin/switch-to-configuration");204		let mut cmd = host.cmd(switch_script).in_current_span().await?;205		cmd.arg(action.name().expect("upload.should_activate == false"));206		if let Err(e) = cmd.sudo().run().in_current_span().await {207			error!("failed to activate: {e}");208			failed = true;209		}210	}211	if action.should_create_rollback_marker() {212		if !disable_rollback {213			if failed {214				if action.should_schedule_rollback_run() {215					info!("executing rollback");216					if let Err(e) = host217						.systemctl_start("rollback-watchdog.service")218						.instrument(info_span!("rollback"))219						.await220					{221						error!("failed to trigger rollback: {e}")222					}223				}224			} else {225				info!("trying to mark upgrade as successful");226				if let Err(e) = host227					.rm_file("/etc/fleet_rollback_marker", true)228					.in_current_span()229					.await230				{231					error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")232				}233			}234			info!("disarming watchdog, just in case");235			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {236				// It is ok, if there was no reboot - then timer might not be running.237			}238			if action.should_schedule_rollback_run() {239				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {240					error!("failed to disarm rollback run: {e}");241				}242			}243		} else if let Err(_e) = host244			.rm_file("/etc/fleet_rollback_marker", true)245			.in_current_span()246			.await247		{248			// Marker might not exist, yet better try to remove it.249		}250	}251	Ok(())252}253254async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {255	info!("building");256	let host = config.host(&host).await?;257	// let action = Action::from(self.subcommand.clone());258	let nixos = host.nixos_config().await?;259	let drv = nix_go!(nixos.system.build[{ build_attr }]);260	let outputs = drv.build().await.inspect_err(|_| {261			if build_attr == "sdImage" {262				info!("sd-image build failed");263				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");264			}265		})?;266	let out_output = outputs267		.get("out")268		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;269270	Ok(out_output.clone())271}272273impl BuildSystems {274	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {275		let hosts = config.list_hosts().await?;276		let set = LocalSet::new();277		let build_attr = self.build_attr.clone();278		for host in hosts.into_iter() {279			if opts.should_skip(&host).await? {280				continue;281			}282			let config = config.clone();283			let span = info_span!("build", host = field::display(&host.name));284			let hostname = host.name;285			let build_attr = build_attr.clone();286			// FIXME: Since the introduction of better-nix-eval,287			// due to single repl used for builds, hosts are waiting for each other to build,288			// instead of building concurrently.289			//290			// Open multiple repls?291			//292			// Create build batcher, which will behave similar to golangs293			// WaitGroup, and start executing once all the build tasks are scheduled?294			// This also allows to cleanup build output, as there will be no longer295			// "waiting for remote machine" messages in the cases when one package is needed for296			// multiple hosts.297			set.spawn_local(298				(async move {299					let built = match build_task(config, hostname.clone(), &build_attr).await {300						Ok(path) => path,301						Err(e) => {302							error!("failed to deploy host: {}", e);303							return;304						}305					};306					// TODO: Handle error307					let mut out = current_dir().expect("cwd exists");308					out.push(format!("built-{}", hostname));309310					info!("linking iso image to {:?}", out);311					if let Err(e) = symlink(built, out) {312						error!("failed to symlink: {e}")313					}314				})315				.instrument(span),316			);317		}318		set.await;319		Ok(())320	}321}322323impl Deploy {324	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {325		let hosts = config.list_hosts().await?;326		let set = LocalSet::new();327		for host in hosts.into_iter() {328			if opts.should_skip(&host).await? {329				continue;330			}331			let config = config.clone();332			let span = info_span!("deploy", host = field::display(&host.name));333			let hostname = host.name.clone();334			let local_host = config.local_host();335			let opts = opts.clone();336			// FIXME: Fix repl concurrency (see build-systems)337			set.spawn_local(338				(async move {339					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await340					{341						Ok(path) => path,342						Err(e) => {343							error!("failed to deploy host: {}", e);344							return;345						}346					};347					if !opts.is_local(&hostname) {348						info!("uploading system closure");349						{350							// TODO: Move to remote_derivation method.351							// Alternatively, nix store make-content-addressed can be used,352							// at least for the first deployment, to provide trusted store key.353							//354							// It is much slower, yet doesn't require root on the deployer machine.355							let Ok(mut sign) = local_host.cmd("nix").await else {356								error!("failed to setup local");357								return;358							};359							// Private key for host machine is registered in nix-sign.nix360							sign.arg("store")361								.arg("sign")362								.comparg("--key-file", "/etc/nix/private-key")363								.arg("-r")364								.arg(&built);365							if let Err(e) = sign.sudo().run_nix().await {366								warn!("failed to sign store paths: {e}");367							};368						}369						let mut tries = 0;370						loop {371							match host.remote_derivation(&built).await {372								Ok(remote) => {373									assert!(remote == built, "CA derivations aren't implemented");374									break;375								}376								Err(e) if tries < 3 => {377									tries += 1;378									warn!("copy failure ({}/3): {}", tries, e);379									sleep(Duration::from_millis(5000)).await;380								}381								Err(e) => {382									error!("upload failed: {e}");383									return;384								}385							}386						}387					}388					if let Err(e) = deploy_task(389						self.action,390						&host,391						built,392						if let Ok(v) = opts.action_attr(&host, "specialisation").await {393							v394						} else {395							error!("unreachable? failed to get specialization");396							return;397						},398						self.disable_rollback,399					)400					.await401					{402						error!("activation failed: {e}");403					}404				})405				.instrument(span),406			);407		}408		set.await;409		Ok(())410	}411}