git.delta.rocks / jrsonnet / refs/commits / deac38eb1c8f

diff · tree · log

fix do not prepare for rollback on upload

Yaroslav Bolyukin · 2024-01-05 · parent: #c0c9b96 · .patch · .diff
in: trunk

1 file changed

modified · cmds/fleet/src/cmds/build_systems.rs · diff · before · after · both
before · cmds/fleet/src/cmds/build_systems.rs
1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use crate::nix_go;8use anyhow::{anyhow, Result};9use clap::{Parser, ValueEnum};10use itertools::Itertools as _;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	action: DeployAction,20}2122#[derive(ValueEnum, Clone, Copy)]23enum DeployAction {24	/// Upload derivation, but do not execute the update.25	Upload,26	/// Upload and execute the activation script, old version will be used after reboot.27	Test,28	/// Upload and set as current system profile, but do not execute activation script.29	Boot,30	/// Upload, set current profile, and execute activation script.31	Switch,32}3334impl DeployAction {35	pub(crate) fn name(&self) -> Option<&'static str> {36		match self {37			DeployAction::Upload => None,38			DeployAction::Test => Some("test"),39			DeployAction::Boot => Some("boot"),40			DeployAction::Switch => Some("switch"),41		}42	}43	pub(crate) fn should_switch_profile(&self) -> bool {44		matches!(self, Self::Switch | Self::Boot)45	}46	pub(crate) fn should_activate(&self) -> bool {47		matches!(self, Self::Switch | Self::Test)48	}49	pub(crate) fn should_schedule_rollback_run(&self) -> bool {50		matches!(self, Self::Switch | Self::Test)51	}52}5354#[derive(Parser, Clone)]55pub struct BuildSystems {56	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes57	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.58	#[clap(long, default_value = "toplevel")]59	build_attr: String,60}6162struct Generation {63	id: u32,64	current: bool,65	datetime: String,66}67async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {68	let mut cmd = host.cmd("nix-env").await?;69	cmd.comparg("--profile", "/nix/var/nix/profiles/system")70		.arg("--list-generations");71	// Sudo is required due to --list-generations acquiring lock on the profile.72	let data = cmd.sudo().run_string().await?;73	let generations = data74		.split('\n')75		.map(|e| e.trim())76		.filter(|&l| !l.is_empty())77		.filter_map(|g| {78			let gen: Option<Generation> = try {79				let mut parts = g.split_whitespace();80				let id = parts.next()?;81				let id: u32 = id.parse().ok()?;82				let date = parts.next()?;83				let time = parts.next()?;84				let current = if let Some(current) = parts.next() {85					if current == "(current)" {86						Some(true)87					} else {88						None89					}90				} else {91					Some(false)92				};93				let current = current?;94				if parts.next().is_some() {95					warn!("unexpected text after generation: {g}");96				}97				Generation {98					id,99					current,100					datetime: format!("{date} {time}"),101				}102			};103			if gen.is_none() {104				warn!("bad generation: {g}")105			}106			gen107		})108		.collect::<Vec<_>>();109	let current = generations110		.into_iter()111		.filter(|g| g.current)112		.at_most_one()113		.map_err(|_e| anyhow!("bad list-generations output"))?114		.ok_or_else(|| anyhow!("failed to find generation"))?;115	Ok(current)116}117118async fn deploy_task(119	action: DeployAction,120	host: &ConfigHost,121	built: PathBuf,122	disable_rollback: bool,123) -> Result<()> {124	let mut failed = false;125	// TODO: Lockfile, to prevent concurrent system switch?126	// TODO: If rollback target exists - bail, it should be 
removed. Lockfile will not work in case if rollback127	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to128	// unit name conflict in systemd-run129	// This code is tied to rollback.nix130	if !disable_rollback {131		let _span = info_span!("preparing").entered();132		info!("preparing for rollback");133		let generation = get_current_generation(host).await?;134		info!(135			"rollback target would be {} {}",136			generation.id, generation.datetime137		);138		{139			let mut cmd = host.cmd("sh").await?;140			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));141			if let Err(e) = cmd.sudo().run().await {142				error!("failed to set rollback marker: {e}");143				failed = true;144			}145		}146		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.147		// Kicking it on manually will work best.148		//149		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will150		// only allow one instance of it.151152		// TODO: We should also watch how this process is going.153		// After running this command, we have less than 3 minutes to deploy everything,154		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.155		// Anyway, reboot will still help in this case.156		if action.should_schedule_rollback_run() {157			let mut cmd = host.cmd("systemd-run").await?;158			cmd.comparg("--on-active", "3min")159				.comparg("--unit", "rollback-watchdog-run")160				.arg("systemctl")161				.arg("start")162				.arg("rollback-watchdog.service");163			if let Err(e) = cmd.sudo().run().await {164				error!("failed to schedule rollback run: {e}");165				failed = true;166			}167		}168	}169170	if action.should_switch_profile() && !failed {171		info!("switching 
generation");172		let mut cmd = host.cmd("nix-env").await?;173		cmd.comparg("--profile", "/nix/var/nix/profiles/system")174			.comparg("--set", &built);175		if let Err(e) = cmd.sudo().run().await {176			error!("failed to switch generation: {e}");177			failed = true;178		}179	}180181	// FIXME: Connection might be disconnected after activation run182183	if action.should_activate() && !failed {184		let _span = info_span!("activating").entered();185		info!("executing activation script");186		let mut switch_script = built.clone();187		switch_script.push("bin");188		switch_script.push("switch-to-configuration");189		let mut cmd = host.cmd(switch_script).in_current_span().await?;190		cmd.arg(action.name().expect("upload.should_activate == false"));191		if let Err(e) = cmd.sudo().run().in_current_span().await {192			error!("failed to activate: {e}");193			failed = true;194		}195	}196	if !disable_rollback {197		if failed {198			info!("executing rollback");199			if let Err(e) = host200				.systemctl_start("rollback-watchdog.service")201				.instrument(info_span!("rollback"))202				.await203			{204				error!("failed to trigger rollback: {e}")205			}206		} else {207			info!("trying to mark upgrade as successful");208			if let Err(e) = host209				.rm_file("/etc/fleet_rollback_marker", true)210				.in_current_span()211				.await212			{213				error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")214			}215		}216		info!("disarming watchdog, just in case");217		if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {218			// It is ok, if there was no reboot - then timer might not be running.219		}220		if action.should_schedule_rollback_run() {221			if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {222				error!("failed to disarm rollback run: {e}");223			}224		}225	} else if let Err(_e) = host226		.rm_file("/etc/fleet_rollback_marker", true)227		.in_current_span()228		.await229	{230		// Marker might not exist, yet better try to remove it.231	}232	Ok(())233}234235async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {236	info!("building");237	let host = config.host(&host).await?;238	// let action = Action::from(self.subcommand.clone());239	let fleet_config = &config.config_field;240	let drv = nix_go!(241		fleet_config.hosts[{ &host.name }]242			.nixosSystem243			.config244			.system245			.build[{ build_attr }]246	);247	let outputs = drv.build().await.map_err(|e| {248			if build_attr == "sdImage" {249				info!("sd-image build failed");250				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");251			}252			e253		})?;254	let out_output = outputs255		.get("out")256		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;257258	Ok(out_output.clone())259}260261impl BuildSystems {262	pub async fn run(self, config: &Config) -> Result<()> {263		let hosts = config.list_hosts().await?;264		let set = LocalSet::new();265		let build_attr = self.build_attr.clone();266		for host in hosts.into_iter() {267			if config.should_skip(&host.name) {268				continue;269			}270			let config = config.clone();271			let span = info_span!("build", host = field::display(&host.name));272			let hostname = host.name;273			let build_attr = 
build_attr.clone();274			// FIXME: Since the introduction of better-nix-eval,275			// due to single repl used for builds, hosts are waiting for each other to build,276			// instead of building concurrently.277			//278			// Open multiple repls?279			//280			// Create build batcher, which will behave similar to golangs281			// WaitGroup, and start executing once all the build tasks are scheduled?282			// This also allows to cleanup build output, as there will be no longer283			// "waiting for remote machine" messages in the cases when one package is needed for284			// multiple hosts.285			set.spawn_local(286				(async move {287					let built = match build_task(config, hostname.clone(), &build_attr).await {288						Ok(path) => path,289						Err(e) => {290							error!("failed to deploy host: {}", e);291							return;292						}293					};294					// TODO: Handle error295					let mut out = current_dir().expect("cwd exists");296					out.push(format!("built-{}", hostname));297298					info!("linking iso image to {:?}", out);299					if let Err(e) = symlink(built, out) {300						error!("failed to symlink: {e}")301					}302				})303				.instrument(span),304			);305		}306		set.await;307		Ok(())308	}309}310311impl Deploy {312	pub async fn run(self, config: &Config) -> Result<()> {313		let hosts = config.list_hosts().await?;314		let set = LocalSet::new();315		for host in hosts.into_iter() {316			if config.should_skip(&host.name) {317				continue;318			}319			let config = config.clone();320			let span = info_span!("deploy", host = field::display(&host.name));321			let hostname = host.name.clone();322			// FIXME: Fix repl concurrency (see build-systems)323			set.spawn_local(324				(async move {325					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await326					{327						Ok(path) => path,328						Err(e) => {329							error!("failed to deploy host: {}", e);330							return;331						}332					};333					if !config.is_local(&hostname) {334						info!("uploading 
system closure");335						{336							// TODO: Move to remote_derivation method.337							// Alternatively, nix store make-content-addressed can be used,338							// at least for the first deployment, to provide trusted store key.339							//340							// It is much slower, yet doesn't require root on the deployer machine.341							let mut sign = MyCommand::new("nix");342							// Private key for host machine is registered in nix-sign.nix343							sign.arg("store")344								.arg("sign")345								.comparg("--key-file", "/etc/nix/private-key")346								.arg("-r")347								.arg(&built);348							if let Err(e) = sign.sudo().run_nix().await {349								warn!("Failed to sign store paths: {e}");350							};351						}352						let mut tries = 0;353						loop {354							match host.remote_derivation(&built).await {355								Ok(remote) => {356									assert!(remote == built, "CA derivations aren't implemented");357									break;358								}359								Err(e) if tries < 3 => {360									tries += 1;361									warn!("copy failure ({}/3): {}", tries, e);362									sleep(Duration::from_millis(5000)).await;363								}364								Err(e) => {365									error!("upload failed: {e}");366									return;367								}368							}369						}370					}371					if let Err(e) =372						deploy_task(self.action, &host, built, self.disable_rollback).await373					{374						error!("activation failed: {e}");375					}376				})377				.instrument(span),378			);379		}380		set.await;381		Ok(())382	}383}
after · cmds/fleet/src/cmds/build_systems.rs
1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use crate::nix_go;8use anyhow::{anyhow, Result};9use clap::{Parser, ValueEnum};10use itertools::Itertools as _;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			DeployAction::Upload => None,39			DeployAction::Test => Some("test"),40			DeployAction::Boot => Some("boot"),41			DeployAction::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}73async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {74	let mut cmd = host.cmd("nix-env").await?;75	cmd.comparg("--profile", "/nix/var/nix/profiles/system")76		.arg("--list-generations");77	// Sudo is required due to --list-generations acquiring lock on the profile.78	let data = cmd.sudo().run_string().await?;79	let generations = data80		.split('\n')81		.map(|e| e.trim())82		.filter(|&l| !l.is_empty())83		.filter_map(|g| {84			let gen: Option<Generation> = try {85				let mut parts = g.split_whitespace();86				let id = parts.next()?;87				let id: u32 = id.parse().ok()?;88				let date = parts.next()?;89				let time = parts.next()?;90				let current = if let Some(current) = parts.next() {91					if current == "(current)" {92						Some(true)93					} else {94						None95					}96				} else {97					Some(false)98				};99				let current = current?;100				if parts.next().is_some() {101					warn!("unexpected text after generation: {g}");102				}103				Generation {104					id,105					current,106					datetime: format!("{date} {time}"),107				}108			};109			if gen.is_none() {110				warn!("bad generation: {g}")111			}112			gen113		})114		.collect::<Vec<_>>();115	let current = generations116		.into_iter()117		.filter(|g| g.current)118		.at_most_one()119		.map_err(|_e| anyhow!("bad list-generations output"))?120		.ok_or_else(|| anyhow!("failed to find generation"))?;121	Ok(current)122}123124async fn deploy_task(125	action: DeployAction,126	host: &ConfigHost,127	built: PathBuf,128	disable_rollback: bool,129) -> Result<()> {130	let mut failed = false;131	// TODO: Lockfile, to prevent concurrent system switch?132	// TODO: If rollback target exists - bail, it 
should be removed. Lockfile will not work in case if rollback133	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to134	// unit name conflict in systemd-run135	// This code is tied to rollback.nix136	if !disable_rollback && action.should_create_rollback_marker() {137		let _span = info_span!("preparing").entered();138		info!("preparing for rollback");139		let generation = get_current_generation(host).await?;140		info!(141			"rollback target would be {} {}",142			generation.id, generation.datetime143		);144		{145			let mut cmd = host.cmd("sh").await?;146			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));147			if let Err(e) = cmd.sudo().run().await {148				error!("failed to set rollback marker: {e}");149				failed = true;150			}151		}152		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.153		// Kicking it on manually will work best.154		//155		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will156		// only allow one instance of it.157158		// TODO: We should also watch how this process is going.159		// After running this command, we have less than 3 minutes to deploy everything,160		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.161		// Anyway, reboot will still help in this case.162		if action.should_schedule_rollback_run() {163			let mut cmd = host.cmd("systemd-run").await?;164			cmd.comparg("--on-active", "3min")165				.comparg("--unit", "rollback-watchdog-run")166				.arg("systemctl")167				.arg("start")168				.arg("rollback-watchdog.service");169			if let Err(e) = cmd.sudo().run().await {170				error!("failed to schedule rollback run: {e}");171				failed = true;172			}173		}174	}175176	if 
action.should_switch_profile() && !failed {177		info!("switching generation");178		let mut cmd = host.cmd("nix-env").await?;179		cmd.comparg("--profile", "/nix/var/nix/profiles/system")180			.comparg("--set", &built);181		if let Err(e) = cmd.sudo().run().await {182			error!("failed to switch generation: {e}");183			failed = true;184		}185	}186187	// FIXME: Connection might be disconnected after activation run188189	if action.should_activate() && !failed {190		let _span = info_span!("activating").entered();191		info!("executing activation script");192		let mut switch_script = built.clone();193		switch_script.push("bin");194		switch_script.push("switch-to-configuration");195		let mut cmd = host.cmd(switch_script).in_current_span().await?;196		cmd.arg(action.name().expect("upload.should_activate == false"));197		if let Err(e) = cmd.sudo().run().in_current_span().await {198			error!("failed to activate: {e}");199			failed = true;200		}201	}202	if action.should_create_rollback_marker() {203		if !disable_rollback {204			if failed {205				if action.should_schedule_rollback_run() {206					info!("executing rollback");207					if let Err(e) = host208						.systemctl_start("rollback-watchdog.service")209						.instrument(info_span!("rollback"))210						.await211					{212						error!("failed to trigger rollback: {e}")213					}214				}215			} else {216				info!("trying to mark upgrade as successful");217				if let Err(e) = host218					.rm_file("/etc/fleet_rollback_marker", true)219					.in_current_span()220					.await221				{222					error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")223				}224			}225			info!("disarming watchdog, just in case");226			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {227				// It is ok, if there was no reboot - then timer might not be running.228			}229			if action.should_schedule_rollback_run() {230				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {231					error!("failed to disarm rollback run: {e}");232				}233			}234		} else if let Err(_e) = host235			.rm_file("/etc/fleet_rollback_marker", true)236			.in_current_span()237			.await238		{239			// Marker might not exist, yet better try to remove it.240		}241	}242	Ok(())243}244245async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {246	info!("building");247	let host = config.host(&host).await?;248	// let action = Action::from(self.subcommand.clone());249	let fleet_config = &config.config_field;250	let drv = nix_go!(251		fleet_config.hosts[{ &host.name }]252			.nixosSystem253			.config254			.system255			.build[{ build_attr }]256	);257	let outputs = drv.build().await.map_err(|e| {258			if build_attr == "sdImage" {259				info!("sd-image build failed");260				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");261			}262			e263		})?;264	let out_output = outputs265		.get("out")266		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;267268	Ok(out_output.clone())269}270271impl BuildSystems {272	pub async fn run(self, config: &Config) -> Result<()> {273		let hosts = config.list_hosts().await?;274		let set = LocalSet::new();275		let build_attr = self.build_attr.clone();276		for host in hosts.into_iter() {277			if config.should_skip(&host.name) {278				continue;279			}280			let config = config.clone();281			let span = info_span!("build", host = field::display(&host.name));282			let hostname = host.name;283			let 
build_attr = build_attr.clone();284			// FIXME: Since the introduction of better-nix-eval,285			// due to single repl used for builds, hosts are waiting for each other to build,286			// instead of building concurrently.287			//288			// Open multiple repls?289			//290			// Create build batcher, which will behave similar to golangs291			// WaitGroup, and start executing once all the build tasks are scheduled?292			// This also allows to cleanup build output, as there will be no longer293			// "waiting for remote machine" messages in the cases when one package is needed for294			// multiple hosts.295			set.spawn_local(296				(async move {297					let built = match build_task(config, hostname.clone(), &build_attr).await {298						Ok(path) => path,299						Err(e) => {300							error!("failed to deploy host: {}", e);301							return;302						}303					};304					// TODO: Handle error305					let mut out = current_dir().expect("cwd exists");306					out.push(format!("built-{}", hostname));307308					info!("linking iso image to {:?}", out);309					if let Err(e) = symlink(built, out) {310						error!("failed to symlink: {e}")311					}312				})313				.instrument(span),314			);315		}316		set.await;317		Ok(())318	}319}320321impl Deploy {322	pub async fn run(self, config: &Config) -> Result<()> {323		let hosts = config.list_hosts().await?;324		let set = LocalSet::new();325		for host in hosts.into_iter() {326			if config.should_skip(&host.name) {327				continue;328			}329			let config = config.clone();330			let span = info_span!("deploy", host = field::display(&host.name));331			let hostname = host.name.clone();332			// FIXME: Fix repl concurrency (see build-systems)333			set.spawn_local(334				(async move {335					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await336					{337						Ok(path) => path,338						Err(e) => {339							error!("failed to deploy host: {}", e);340							return;341						}342					};343					if !config.is_local(&hostname) {344						
info!("uploading system closure");345						{346							// TODO: Move to remote_derivation method.347							// Alternatively, nix store make-content-addressed can be used,348							// at least for the first deployment, to provide trusted store key.349							//350							// It is much slower, yet doesn't require root on the deployer machine.351							let mut sign = MyCommand::new("nix");352							// Private key for host machine is registered in nix-sign.nix353							sign.arg("store")354								.arg("sign")355								.comparg("--key-file", "/etc/nix/private-key")356								.arg("-r")357								.arg(&built);358							if let Err(e) = sign.sudo().run_nix().await {359								warn!("Failed to sign store paths: {e}");360							};361						}362						let mut tries = 0;363						loop {364							match host.remote_derivation(&built).await {365								Ok(remote) => {366									assert!(remote == built, "CA derivations aren't implemented");367									break;368								}369								Err(e) if tries < 3 => {370									tries += 1;371									warn!("copy failure ({}/3): {}", tries, e);372									sleep(Duration::from_millis(5000)).await;373								}374								Err(e) => {375									error!("upload failed: {e}");376									return;377								}378							}379						}380					}381					if let Err(e) =382						deploy_task(self.action, &host, built, self.disable_rollback).await383					{384						error!("activation failed: {e}");385					}386				})387				.instrument(span),388			);389		}390		set.await;391		Ok(())392	}393}