git.delta.rocks / jrsonnet / refs/commits / 1aab6a2e63b6

difftreelog

source

cmds/fleet/src/cmds/build_systems.rs11.5 KiBsourcehistory
1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::Config;7use crate::nix_path;8use anyhow::{anyhow, Result, Context};9use clap::Parser;10use itertools::Itertools;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser, Clone)]15pub struct BuildSystems {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	#[clap(subcommand)]20	subcommand: Subcommand,21}2223enum UploadAction {24	Test,25	Boot,26	Switch,27}28impl UploadAction {29	fn name(&self) -> &'static str {30		match self {31			UploadAction::Test => "test",32			UploadAction::Boot => "boot",33			UploadAction::Switch => "switch",34		}35	}3637	pub(crate) fn should_switch_profile(&self) -> bool {38		matches!(self, Self::Switch | Self::Boot)39	}40	pub(crate) fn should_activate(&self) -> bool {41		matches!(self, Self::Switch | Self::Test)42	}43	pub(crate) fn should_schedule_rollback_run(&self) -> bool {44		matches!(self, Self::Switch | Self::Test)45	}46}4748enum PackageAction {49	SdImage,50	InstallationCd,51}52impl PackageAction {53	fn build_attr(&self) -> String {54		match self {55			PackageAction::SdImage => "sdImage".to_owned(),56			PackageAction::InstallationCd => "installationCd".to_owned(),57		}58	}59}6061enum Action {62	Upload { action: Option<UploadAction> },63	Package(PackageAction),64}65impl Action {66	fn build_attr(&self) -> String {67		match self {68			Action::Upload { .. } => "toplevel".to_owned(),69			Action::Package(p) => p.build_attr(),70		}71	}72}7374impl From<Subcommand> for Action {75	fn from(s: Subcommand) -> Self {76		match s {77			Subcommand::Upload => Self::Upload { action: None },78			Subcommand::Test => Self::Upload {79				action: Some(UploadAction::Test),80			},81			Subcommand::Boot => Self::Upload {82				action: Some(UploadAction::Boot),83			},84			Subcommand::Switch => Self::Upload {85				action: Some(UploadAction::Switch),86			},87			Subcommand::SdImage => Self::Package(PackageAction::SdImage),88			Subcommand::InstallationCd => Self::Package(PackageAction::InstallationCd),89		}90	}91}9293#[derive(Parser, Clone)]94enum Subcommand {95	/// Upload, but do not switch96	Upload,97	/// Upload + switch to built system until reboot98	Test,99	/// Upload + switch to built system after reboot100	Boot,101	/// Upload + test + boot102	Switch,103104	/// Build SD .img image105	SdImage,106	/// Build an installation cd ISO image107	InstallationCd,108}109110struct Generation {111	id: u32,112	current: bool,113	datetime: String,114}115async fn get_current_generation(config: &Config, host: &str) -> Result<Generation> {116	let mut cmd = MyCommand::new("nix-env");117	cmd.comparg("--profile", "/nix/var/nix/profiles/system")118		.arg("--list-generations");119	// Sudo is required due to --list-generations acquiring lock on the profile.120	let data = config.run_string_on(host, cmd, true).await?;121	let generations = data122		.split('\n')123		.map(|e| e.trim())124		.filter(|&l| !l.is_empty())125		.filter_map(|g| {126			let gen: Option<Generation> = try {127				let mut parts = g.split_whitespace();128				let id = parts.next()?;129				let id: u32 = id.parse().ok()?;130				let date = parts.next()?;131				let time = parts.next()?;132				let current = if let Some(current) = parts.next() {133					if current == "(current)" {134						Some(true)135					} else {136						None137					}138				} else {139					Some(false)140				};141				let current = current?;142				if parts.next().is_some() {143					warn!("unexpected text after generation: {g}");144				}145				Generation {146					id,147					current,148					datetime: format!("{date} {time}"),149				}150			};151			if gen.is_none() {152				warn!("bad generation: {g}")153			}154			gen155		})156		.collect::<Vec<_>>();157	let current = generations158		.into_iter()159		.filter(|g| g.current)160		.at_most_one()161		.map_err(|_e| anyhow!("bad list-generations output"))?162		.ok_or_else(|| anyhow!("failed to find generation"))?;163	Ok(current)164}165166async fn systemctl_stop(config: &Config, host: &str, unit: &str) -> Result<()> {167	let mut cmd = MyCommand::new("systemctl");168	cmd.arg("stop").arg(unit);169	config.run_on(host, cmd, true).await170}171172async fn systemctl_start(config: &Config, host: &str, unit: &str) -> Result<()> {173	let mut cmd = MyCommand::new("systemctl");174	cmd.arg("start").arg(unit);175	config.run_on(host, cmd, true).await176}177178async fn execute_upload(179	build: &BuildSystems,180	config: &Config,181	action: UploadAction,182	host: &str,183	built: PathBuf,184) -> Result<()> {185	let mut failed = false;186	// TODO: Lockfile, to prevent concurrent system switch?187	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback188	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to189	// unit name conflict in systemd-run190	// This code is tied to rollback.nix191	if !build.disable_rollback {192		let _span = info_span!("preparing").entered();193		info!("preparing for rollback");194		let generation = get_current_generation(config, host).await?;195		info!(196			"rollback target would be {} {}",197			generation.id, generation.datetime198		);199		{200			let mut cmd = MyCommand::new("sh");201			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));202			if let Err(e) = config.run_on(host, cmd, true).await {203				error!("failed to set rollback marker: {e}");204				failed = true;205			}206		}207		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.208		// Kicking it on manually will work best.209		//210		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will211		// only allow one instance of it.212213		// TODO: We should also watch how this process is going.214		// After running this command, we have less than 3 minutes to deploy everything,215		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.216		// Anyway, reboot will still help in this case.217		if action.should_schedule_rollback_run() {218			let mut cmd = MyCommand::new("systemd-run");219			cmd.comparg("--on-active", "3min")220				.comparg("--unit", "rollback-watchdog-run")221				.arg("systemctl")222				.arg("start")223				.arg("rollback-watchdog.service");224			if let Err(e) = config.run_on(host, cmd, true).await {225				error!("failed to schedule rollback run: {e}");226				failed = true;227			}228		}229	}230	if action.should_switch_profile() && !failed {231		info!("switching generation");232		let mut cmd = MyCommand::new("nix-env");233		cmd.comparg("--profile", "/nix/var/nix/profiles/system")234			.comparg("--set", &built);235		if let Err(e) = config.run_on(host, cmd, true).await {236			error!("failed to switch generation: {e}");237			failed = true;238		}239	}240	if action.should_activate() && !failed {241		let _span = info_span!("activating").entered();242		info!("executing activation script");243		let mut switch_script = built.clone();244		switch_script.push("bin");245		switch_script.push("switch-to-configuration");246		let mut cmd = MyCommand::new(switch_script);247		cmd.arg(action.name());248		if let Err(e) = config.run_on(host, cmd, true).in_current_span().await {249			error!("failed to activate: {e}");250			failed = true;251		}252	}253	if !build.disable_rollback {254		if failed {255			info!("executing rollback");256			if let Err(e) = systemctl_start(config, host, "rollback-watchdog.service")257				.instrument(info_span!("rollback"))258				.await259			{260				error!("failed to trigger rollback: {e}")261			}262		} else {263			info!("trying to mark upgrade as successful");264			let mut cmd = MyCommand::new("rm");265			cmd.arg("-f").arg("/etc/fleet_rollback_marker");266			if let Err(e) = config.run_on(host, cmd, true).in_current_span().await {267				error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")268			}269		}270		info!("disarming watchdog, just in case");271		if let Err(_e) = systemctl_stop(config, host, "rollback-watchdog.timer").await {272			// It is ok, if there was no reboot - then timer might not be running.273		}274		if action.should_schedule_rollback_run() {275			if let Err(e) = systemctl_stop(config, host, "rollback-watchdog-run.timer").await {276				error!("failed to disarm rollback run: {e}");277			}278		}279	} else {280		let mut cmd = MyCommand::new("rm");281		cmd.arg("-f").arg("/etc/fleet_rollback_marker");282		if let Err(_e) = config.run_on(host, cmd, true).in_current_span().await {283			// Marker might not exist, yet better try to remove it.284		}285	}286	Ok(())287}288289impl BuildSystems {290	async fn build_task(self, config: Config, host: String) -> Result<()> {291		info!("building");292		let action = Action::from(self.subcommand.clone());293		let drv = config294			.fleet_field295			.select(nix_path!(.buildSystems((serde_json::json!({296				"localSystem": config.local_system.clone(),297			}))).{action.build_attr()}.{&host}))298			.await.context("system attribute")?;299		let outputs = drv.build().await.map_err(|e| {300			if action.build_attr() == "sdImage" {301				info!("sd-image build failed");302				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");303			}304			e305		})?;306		let out_output = outputs307			.get("out")308			.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;309310		match action {311			Action::Upload { action } => {312				if !config.is_local(&host) {313					info!("uploading system closure");314					{315						// Alternatively, nix store make-content-addressed can be used,316						// at least for the first deployment, to provide trusted store key.317						//318						// It is much slower, yet doesn't require root on the deployer machine.319						let mut sign = MyCommand::new("nix");320						// Private key for host machine is registered in nix-sign.nix321						sign.arg("store")322							.arg("sign")323							.comparg("--key-file", "/etc/nix/private-key")324							.arg("-r")325							.arg(out_output);326						if let Err(e) = sign.sudo().run_nix().await {327							warn!("Failed to sign store paths: {e}");328						};329					}330					let mut tries = 0;331					loop {332						let mut nix = MyCommand::new("nix");333						nix.arg("copy")334							.arg("--substitute-on-destination")335							.comparg("--to", format!("ssh-ng://{host}"))336							.arg(out_output);337						match nix.run_nix().await {338							Ok(()) => break,339							Err(e) if tries < 3 => {340								tries += 1;341								warn!("Copy failure ({}/3): {}", tries, e);342								sleep(Duration::from_millis(5000)).await;343							}344							Err(e) => return Err(e),345						}346					}347				}348				if let Some(action) = action {349					execute_upload(&self, &config, action, &host, out_output.clone()).await?350				}351			}352			Action::Package(PackageAction::SdImage) => {353				let mut out = current_dir()?;354				out.push(format!("sd-image-{}", host));355356				info!("linking sd image to {:?}", out);357				symlink(out_output, out)?;358			}359			Action::Package(PackageAction::InstallationCd) => {360				let mut out = current_dir()?;361				out.push(format!("installation-cd-{}", host));362363				info!("linking iso image to {:?}", out);364				symlink(out_output, out)?;365			}366		};367		Ok(())368	}369370	pub async fn run(self, config: &Config) -> Result<()> {371		let hosts = config.list_hosts().await?;372		let set = LocalSet::new();373		let this = &self;374		for host in hosts.into_iter() {375			if config.should_skip(&host.name) {376				continue;377			}378			let config = config.clone();379			let this = this.clone();380			let span = info_span!("deployment", host = field::display(&host.name));381			let hostname = host.name;382			set.spawn_local(383				(async move {384					match this.build_task(config, hostname).await {385						Ok(_) => {}386						Err(e) => {387							error!("failed to deploy host: {}", e)388						}389					}390				})391				.instrument(span),392			);393		}394		set.await;395		Ok(())396	}397}