git.delta.rocks / jrsonnet / refs/commits / d9405376effd

difftreelog

source

cmds/fleet/src/cmds/build_systems.rs12.0 KiBsourcehistory
1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use anyhow::{anyhow, Result};8use clap::{Parser, ValueEnum};9use itertools::Itertools as _;10use nix_eval::nix_go;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			DeployAction::Upload => None,39			DeployAction::Test => Some("test"),40			DeployAction::Boot => Some("boot"),41			DeployAction::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}73async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {74	let mut cmd = host.cmd("nix-env").await?;75	cmd.comparg("--profile", "/nix/var/nix/profiles/system")76		.arg("--list-generations");77	// Sudo is required due to --list-generations acquiring lock on the profile.78	let data = cmd.sudo().run_string().await?;79	let generations = data80		.split('\n')81		.map(|e| e.trim())82		.filter(|&l| !l.is_empty())83		.filter_map(|g| {84			let gen: Option<Generation> = try {85				let mut parts = g.split_whitespace();86				let id = parts.next()?;87				let id: u32 = id.parse().ok()?;88				let date = parts.next()?;89				let time = parts.next()?;90				let current = if let Some(current) = parts.next() {91					if current == "(current)" {92						Some(true)93					} else {94						None95					}96				} else {97					Some(false)98				};99				let current = current?;100				if parts.next().is_some() {101					warn!("unexpected text after generation: {g}");102				}103				Generation {104					id,105					current,106					datetime: format!("{date} {time}"),107				}108			};109			if gen.is_none() {110				warn!("bad generation: {g}")111			}112			gen113		})114		.collect::<Vec<_>>();115	let current = generations116		.into_iter()117		.filter(|g| g.current)118		.at_most_one()119		.map_err(|_e| anyhow!("bad list-generations output"))?120		.ok_or_else(|| anyhow!("failed to find generation"))?;121	Ok(current)122}123124async fn deploy_task(125	action: DeployAction,126	host: &ConfigHost,127	built: PathBuf,128	disable_rollback: bool,129) -> Result<()> {130	let mut failed = false;131	// TODO: Lockfile, to prevent concurrent system switch?132	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback133	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to134	// unit name conflict in systemd-run135	// This code is tied to rollback.nix136	if !disable_rollback && action.should_create_rollback_marker() {137		let _span = info_span!("preparing").entered();138		info!("preparing for rollback");139		let generation = get_current_generation(host).await?;140		info!(141			"rollback target would be {} {}",142			generation.id, generation.datetime143		);144		{145			let mut cmd = host.cmd("sh").await?;146			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));147			if let Err(e) = cmd.sudo().run().await {148				error!("failed to set rollback marker: {e}");149				failed = true;150			}151		}152		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.153		// Kicking it on manually will work best.154		//155		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will156		// only allow one instance of it.157158		// TODO: We should also watch how this process is going.159		// After running this command, we have less than 3 minutes to deploy everything,160		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.161		// Anyway, reboot will still help in this case.162		if action.should_schedule_rollback_run() {163			let mut cmd = host.cmd("systemd-run").await?;164			cmd.comparg("--on-active", "3min")165				.comparg("--unit", "rollback-watchdog-run")166				.arg("systemctl")167				.arg("start")168				.arg("rollback-watchdog.service");169			if let Err(e) = cmd.sudo().run().await {170				error!("failed to schedule rollback run: {e}");171				failed = true;172			}173		}174	}175176	if action.should_switch_profile() && !failed {177		info!("switching generation");178		let mut cmd = host.cmd("nix-env").await?;179		cmd.comparg("--profile", "/nix/var/nix/profiles/system")180			.comparg("--set", &built);181		if let Err(e) = cmd.sudo().run().await {182			error!("failed to switch generation: {e}");183			failed = true;184		}185	}186187	// FIXME: Connection might be disconnected after activation run188189	if action.should_activate() && !failed {190		let _span = info_span!("activating").entered();191		info!("executing activation script");192		let mut switch_script = built.clone();193		switch_script.push("bin");194		switch_script.push("switch-to-configuration");195		let mut cmd = host.cmd(switch_script).in_current_span().await?;196		cmd.arg(action.name().expect("upload.should_activate == false"));197		if let Err(e) = cmd.sudo().run().in_current_span().await {198			error!("failed to activate: {e}");199			failed = true;200		}201	}202	if action.should_create_rollback_marker() {203		if !disable_rollback {204			if failed {205				if action.should_schedule_rollback_run() {206					info!("executing rollback");207					if let Err(e) = host208						.systemctl_start("rollback-watchdog.service")209						.instrument(info_span!("rollback"))210						.await211					{212						error!("failed to trigger rollback: {e}")213					}214				}215			} else {216				info!("trying to mark upgrade as successful");217				if let Err(e) = host218					.rm_file("/etc/fleet_rollback_marker", true)219					.in_current_span()220					.await221				{222					error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")223				}224			}225			info!("disarming watchdog, just in case");226			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {227				// It is ok, if there was no reboot - then timer might not be running.228			}229			if action.should_schedule_rollback_run() {230				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {231					error!("failed to disarm rollback run: {e}");232				}233			}234		} else if let Err(_e) = host235			.rm_file("/etc/fleet_rollback_marker", true)236			.in_current_span()237			.await238		{239			// Marker might not exist, yet better try to remove it.240		}241	}242	Ok(())243}244245async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {246	info!("building");247	let host = config.host(&host).await?;248	// let action = Action::from(self.subcommand.clone());249	let fleet_config = &config.config_field;250	let drv = nix_go!(251		fleet_config.hosts[{ &host.name }]252			.nixosSystem253			.config254			.system255			.build[{ build_attr }]256	);257	let outputs = drv.build().await.map_err(|e| {258			if build_attr == "sdImage" {259				info!("sd-image build failed");260				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");261			}262			e263		})?;264	let out_output = outputs265		.get("out")266		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;267268	Ok(out_output.clone())269}270271impl BuildSystems {272	pub async fn run(self, config: &Config) -> Result<()> {273		let hosts = config.list_hosts().await?;274		let set = LocalSet::new();275		let build_attr = self.build_attr.clone();276		for host in hosts.into_iter() {277			if config.should_skip(&host.name) {278				continue;279			}280			let config = config.clone();281			let span = info_span!("build", host = field::display(&host.name));282			let hostname = host.name;283			let build_attr = build_attr.clone();284			// FIXME: Since the introduction of better-nix-eval,285			// due to single repl used for builds, hosts are waiting for each other to build,286			// instead of building concurrently.287			//288			// Open multiple repls?289			//290			// Create build batcher, which will behave similar to golangs291			// WaitGroup, and start executing once all the build tasks are scheduled?292			// This also allows to cleanup build output, as there will be no longer293			// "waiting for remote machine" messages in the cases when one package is needed for294			// multiple hosts.295			set.spawn_local(296				(async move {297					let built = match build_task(config, hostname.clone(), &build_attr).await {298						Ok(path) => path,299						Err(e) => {300							error!("failed to deploy host: {}", e);301							return;302						}303					};304					// TODO: Handle error305					let mut out = current_dir().expect("cwd exists");306					out.push(format!("built-{}", hostname));307308					info!("linking iso image to {:?}", out);309					if let Err(e) = symlink(built, out) {310						error!("failed to symlink: {e}")311					}312				})313				.instrument(span),314			);315		}316		set.await;317		Ok(())318	}319}320321impl Deploy {322	pub async fn run(self, config: &Config) -> Result<()> {323		let hosts = config.list_hosts().await?;324		let set = LocalSet::new();325		for host in hosts.into_iter() {326			if config.should_skip(&host.name) {327				continue;328			}329			let config = config.clone();330			let span = info_span!("deploy", host = field::display(&host.name));331			let hostname = host.name.clone();332			// FIXME: Fix repl concurrency (see build-systems)333			set.spawn_local(334				(async move {335					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await336					{337						Ok(path) => path,338						Err(e) => {339							error!("failed to deploy host: {}", e);340							return;341						}342					};343					if !config.is_local(&hostname) {344						info!("uploading system closure");345						{346							// TODO: Move to remote_derivation method.347							// Alternatively, nix store make-content-addressed can be used,348							// at least for the first deployment, to provide trusted store key.349							//350							// It is much slower, yet doesn't require root on the deployer machine.351							let mut sign = MyCommand::new("nix");352							// Private key for host machine is registered in nix-sign.nix353							sign.arg("store")354								.arg("sign")355								.comparg("--key-file", "/etc/nix/private-key")356								.arg("-r")357								.arg(&built);358							if let Err(e) = sign.sudo().run_nix().await {359								warn!("Failed to sign store paths: {e}");360							};361						}362						let mut tries = 0;363						loop {364							match host.remote_derivation(&built).await {365								Ok(remote) => {366									assert!(remote == built, "CA derivations aren't implemented");367									break;368								}369								Err(e) if tries < 3 => {370									tries += 1;371									warn!("copy failure ({}/3): {}", tries, e);372									sleep(Duration::from_millis(5000)).await;373								}374								Err(e) => {375									error!("upload failed: {e}");376									return;377								}378							}379						}380					}381					if let Err(e) =382						deploy_task(self.action, &host, built, self.disable_rollback).await383					{384						error!("activation failed: {e}");385					}386				})387				.instrument(span),388			);389		}390		set.await;391		Ok(())392	}393}