git.delta.rocks / jrsonnet / refs/commits / deac38eb1c8f

difftreelog

fix do not prepare for rollback on upload

Yaroslav Bolyukin2024-01-05parent: #c0c9b96.patch.diff
in: trunk

1 file changed

modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
before · cmds/fleet/src/cmds/build_systems.rs
1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use crate::nix_go;8use anyhow::{anyhow, Result};9use clap::{Parser, ValueEnum};10use itertools::Itertools as _;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	action: DeployAction,20}2122#[derive(ValueEnum, Clone, Copy)]23enum DeployAction {24	/// Upload derivation, but do not execute the update.25	Upload,26	/// Upload and execute the activation script, old version will be used after reboot.27	Test,28	/// Upload and set as current system profile, but do not execute activation script.29	Boot,30	/// Upload, set current profile, and execute activation script.31	Switch,32}3334impl DeployAction {35	pub(crate) fn name(&self) -> Option<&'static str> {36		match self {37			DeployAction::Upload => None,38			DeployAction::Test => Some("test"),39			DeployAction::Boot => Some("boot"),40			DeployAction::Switch => Some("switch"),41		}42	}43	pub(crate) fn should_switch_profile(&self) -> bool {44		matches!(self, Self::Switch | Self::Boot)45	}46	pub(crate) fn should_activate(&self) -> bool {47		matches!(self, Self::Switch | Self::Test)48	}49	pub(crate) fn should_schedule_rollback_run(&self) -> bool {50		matches!(self, Self::Switch | Self::Test)51	}52}5354#[derive(Parser, Clone)]55pub struct BuildSystems {56	/// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes57	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.58	#[clap(long, default_value = "toplevel")]59	build_attr: String,60}6162struct Generation {63	id: u32,64	current: bool,65	datetime: String,66}67async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {68	let mut cmd = host.cmd("nix-env").await?;69	cmd.comparg("--profile", "/nix/var/nix/profiles/system")70		.arg("--list-generations");71	// Sudo is required due to --list-generations acquiring lock on the profile.72	let data = cmd.sudo().run_string().await?;73	let generations = data74		.split('\n')75		.map(|e| e.trim())76		.filter(|&l| !l.is_empty())77		.filter_map(|g| {78			let gen: Option<Generation> = try {79				let mut parts = g.split_whitespace();80				let id = parts.next()?;81				let id: u32 = id.parse().ok()?;82				let date = parts.next()?;83				let time = parts.next()?;84				let current = if let Some(current) = parts.next() {85					if current == "(current)" {86						Some(true)87					} else {88						None89					}90				} else {91					Some(false)92				};93				let current = current?;94				if parts.next().is_some() {95					warn!("unexpected text after generation: {g}");96				}97				Generation {98					id,99					current,100					datetime: format!("{date} {time}"),101				}102			};103			if gen.is_none() {104				warn!("bad generation: {g}")105			}106			gen107		})108		.collect::<Vec<_>>();109	let current = generations110		.into_iter()111		.filter(|g| g.current)112		.at_most_one()113		.map_err(|_e| anyhow!("bad list-generations output"))?114		.ok_or_else(|| anyhow!("failed to find generation"))?;115	Ok(current)116}117118async fn deploy_task(119	action: DeployAction,120	host: &ConfigHost,121	built: PathBuf,122	disable_rollback: bool,123) -> Result<()> {124	let mut failed = false;125	// TODO: Lockfile, to prevent concurrent system switch?126	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback127	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to128	// unit name conflict in systemd-run129	// This code is tied to rollback.nix130	if !disable_rollback {131		let _span = info_span!("preparing").entered();132		info!("preparing for rollback");133		let generation = get_current_generation(host).await?;134		info!(135			"rollback target would be {} {}",136			generation.id, generation.datetime137		);138		{139			let mut cmd = host.cmd("sh").await?;140			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));141			if let Err(e) = cmd.sudo().run().await {142				error!("failed to set rollback marker: {e}");143				failed = true;144			}145		}146		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.147		// Kicking it on manually will work best.148		//149		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will150		// only allow one instance of it.151152		// TODO: We should also watch how this process is going.153		// After running this command, we have less than 3 minutes to deploy everything,154		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.155		// Anyway, reboot will still help in this case.156		if action.should_schedule_rollback_run() {157			let mut cmd = host.cmd("systemd-run").await?;158			cmd.comparg("--on-active", "3min")159				.comparg("--unit", "rollback-watchdog-run")160				.arg("systemctl")161				.arg("start")162				.arg("rollback-watchdog.service");163			if let Err(e) = cmd.sudo().run().await {164				error!("failed to schedule rollback run: {e}");165				failed = true;166			}167		}168	}169170	if action.should_switch_profile() && !failed {171		info!("switching generation");172		let mut cmd = host.cmd("nix-env").await?;173		cmd.comparg("--profile", "/nix/var/nix/profiles/system")174			.comparg("--set", &built);175		if let Err(e) = cmd.sudo().run().await {176			error!("failed to switch generation: {e}");177			failed = true;178		}179	}180181	// FIXME: Connection might be disconnected after activation run182183	if action.should_activate() && !failed {184		let _span = info_span!("activating").entered();185		info!("executing activation script");186		let mut switch_script = built.clone();187		switch_script.push("bin");188		switch_script.push("switch-to-configuration");189		let mut cmd = host.cmd(switch_script).in_current_span().await?;190		cmd.arg(action.name().expect("upload.should_activate == false"));191		if let Err(e) = cmd.sudo().run().in_current_span().await {192			error!("failed to activate: {e}");193			failed = true;194		}195	}196	if !disable_rollback {197		if failed {198			info!("executing rollback");199			if let Err(e) = host200				.systemctl_start("rollback-watchdog.service")201				.instrument(info_span!("rollback"))202				.await203			{204				error!("failed to trigger rollback: {e}")205			}206		} else {207			info!("trying to mark upgrade as successful");208			if let Err(e) = host209				.rm_file("/etc/fleet_rollback_marker", true)210				.in_current_span()211				.await212			{213				error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")214			}215		}216		info!("disarming watchdog, just in case");217		if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {218			// It is ok, if there was no reboot - then timer might not be running.219		}220		if action.should_schedule_rollback_run() {221			if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {222				error!("failed to disarm rollback run: {e}");223			}224		}225	} else if let Err(_e) = host226		.rm_file("/etc/fleet_rollback_marker", true)227		.in_current_span()228		.await229	{230		// Marker might not exist, yet better try to remove it.231	}232	Ok(())233}234235async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {236	info!("building");237	let host = config.host(&host).await?;238	// let action = Action::from(self.subcommand.clone());239	let fleet_config = &config.config_field;240	let drv = nix_go!(241		fleet_config.hosts[{ &host.name }]242			.nixosSystem243			.config244			.system245			.build[{ build_attr }]246	);247	let outputs = drv.build().await.map_err(|e| {248			if build_attr == "sdImage" {249				info!("sd-image build failed");250				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");251			}252			e253		})?;254	let out_output = outputs255		.get("out")256		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;257258	Ok(out_output.clone())259}260261impl BuildSystems {262	pub async fn run(self, config: &Config) -> Result<()> {263		let hosts = config.list_hosts().await?;264		let set = LocalSet::new();265		let build_attr = self.build_attr.clone();266		for host in hosts.into_iter() {267			if config.should_skip(&host.name) {268				continue;269			}270			let config = config.clone();271			let span = info_span!("build", host = field::display(&host.name));272			let hostname = host.name;273			let build_attr = build_attr.clone();274			// FIXME: Since the introduction of better-nix-eval,275			// due to single repl used for builds, hosts are waiting for each other to build,276			// instead of building concurrently.277			//278			// Open multiple repls?279			//280			// Create build batcher, which will behave similar to golangs281			// WaitGroup, and start executing once all the build tasks are scheduled?282			// This also allows to cleanup build output, as there will be no longer283			// "waiting for remote machine" messages in the cases when one package is needed for284			// multiple hosts.285			set.spawn_local(286				(async move {287					let built = match build_task(config, hostname.clone(), &build_attr).await {288						Ok(path) => path,289						Err(e) => {290							error!("failed to deploy host: {}", e);291							return;292						}293					};294					// TODO: Handle error295					let mut out = current_dir().expect("cwd exists");296					out.push(format!("built-{}", hostname));297298					info!("linking iso image to {:?}", out);299					if let Err(e) = symlink(built, out) {300						error!("failed to symlink: {e}")301					}302				})303				.instrument(span),304			);305		}306		set.await;307		Ok(())308	}309}310311impl Deploy {312	pub async fn run(self, config: &Config) -> Result<()> {313		let hosts = config.list_hosts().await?;314		let set = LocalSet::new();315		for host in hosts.into_iter() {316			if config.should_skip(&host.name) {317				continue;318			}319			let config = config.clone();320			let span = info_span!("deploy", host = field::display(&host.name));321			let hostname = host.name.clone();322			// FIXME: Fix repl concurrency (see build-systems)323			set.spawn_local(324				(async move {325					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await326					{327						Ok(path) => path,328						Err(e) => {329							error!("failed to deploy host: {}", e);330							return;331						}332					};333					if !config.is_local(&hostname) {334						info!("uploading system closure");335						{336							// TODO: Move to remote_derivation method.337							// Alternatively, nix store make-content-addressed can be used,338							// at least for the first deployment, to provide trusted store key.339							//340							// It is much slower, yet doesn't require root on the deployer machine.341							let mut sign = MyCommand::new("nix");342							// Private key for host machine is registered in nix-sign.nix343							sign.arg("store")344								.arg("sign")345								.comparg("--key-file", "/etc/nix/private-key")346								.arg("-r")347								.arg(&built);348							if let Err(e) = sign.sudo().run_nix().await {349								warn!("Failed to sign store paths: {e}");350							};351						}352						let mut tries = 0;353						loop {354							match host.remote_derivation(&built).await {355								Ok(remote) => {356									assert!(remote == built, "CA derivations aren't implemented");357									break;358								}359								Err(e) if tries < 3 => {360									tries += 1;361									warn!("copy failure ({}/3): {}", tries, e);362									sleep(Duration::from_millis(5000)).await;363								}364								Err(e) => {365									error!("upload failed: {e}");366									return;367								}368							}369						}370					}371					if let Err(e) =372						deploy_task(self.action, &host, built, self.disable_rollback).await373					{374						error!("activation failed: {e}");375					}376				})377				.instrument(span),378			);379		}380		set.await;381		Ok(())382	}383}