git.delta.rocks / jrsonnet / refs/commits / fcad02a8426f

difftreelog

feat create gc root per built system

Yaroslav Bolyukin2024-11-19parent: #353fd26.patch.diff
in: trunk

5 files changed

modifiedCargo.lockdiffbeforeafterboth
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -962,6 +962,7 @@
  "nixlike",
  "nom",
  "openssh",
+ "rand",
  "serde",
  "serde_json",
  "tempfile",
modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
before · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}73async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {74	let mut cmd = host.cmd("nix-env").await?;75	cmd.comparg("--profile", "/nix/var/nix/profiles/system")76		.arg("--list-generations");77	// Sudo is required due to --list-generations acquiring lock on the profile.78	let data = cmd.sudo().run_string().await?;79	let generations = data80		.split('\n')81		.map(|e| e.trim())82		.filter(|&l| !l.is_empty())83		.filter_map(|g| {84			let gen: Option<Generation> = try {85				let mut parts = g.split_whitespace();86				let id = parts.next()?;87				let id: u32 = id.parse().ok()?;88				let date = parts.next()?;89				let time = parts.next()?;90				let current = if let Some(current) = parts.next() {91					if current == "(current)" {92						Some(true)93					} else {94						None95					}96				} else {97					Some(false)98				};99				let current = current?;100				if parts.next().is_some() {101					warn!("unexpected text after generation: {g}");102				}103				Generation {104					id,105					current,106					datetime: format!("{date} {time}"),107				}108			};109			if gen.is_none() {110				warn!("bad generation: {g}")111			}112			gen113		})114		.collect::<Vec<_>>();115	let current = generations116		.into_iter()117		.filter(|g| g.current)118		.at_most_one()119		.map_err(|_e| anyhow!("bad list-generations output"))?120		.ok_or_else(|| anyhow!("failed to find generation"))?;121	Ok(current)122}123124async fn deploy_task(125	action: DeployAction,126	host: &ConfigHost,127	built: PathBuf,128	specialisation: Option<String>,129	disable_rollback: bool,130) -> Result<()> {131	let mut failed = false;132	// TODO: Lockfile, to prevent concurrent system switch?133	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback134	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to135	// unit name conflict in systemd-run136	// This code is tied to rollback.nix137	if !disable_rollback && action.should_create_rollback_marker() {138		let _span = info_span!("preparing").entered();139		info!("preparing for rollback");140		let generation = get_current_generation(host).await?;141		info!(142			"rollback target would be {} {}",143			generation.id, generation.datetime144		);145		{146			let mut cmd = host.cmd("sh").await?;147			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));148			if let Err(e) = cmd.sudo().run().await {149				error!("failed to set rollback marker: {e}");150				failed = true;151			}152		}153		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.154		// Kicking it on manually will work best.155		//156		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will157		// only allow one instance of it.158159		// TODO: We should also watch how this process is going.160		// After running this command, we have less than 3 minutes to deploy everything,161		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.162		// Anyway, reboot will still help in this case.163		if action.should_schedule_rollback_run() {164			let mut cmd = host.cmd("systemd-run").await?;165			cmd.comparg("--on-active", "3min")166				.comparg("--unit", "rollback-watchdog-run")167				.arg("systemctl")168				.arg("start")169				.arg("rollback-watchdog.service");170			if let Err(e) = cmd.sudo().run().await {171				error!("failed to schedule rollback run: {e}");172				failed = true;173			}174		}175	}176177	if action.should_switch_profile() && !failed {178		info!("switching system profile generation");179		// It would also be possible to update profile atomically during copy:180		// https://github.com/NixOS/nix/pull/11657181		let mut cmd = host.cmd("nix").await?;182		cmd.arg("build");183		cmd.comparg("--profile", "/nix/var/nix/profiles/system");184		cmd.arg(&built);185		if let Err(e) = cmd.sudo().run_nix().await {186			error!("failed to switch system profile generation: {e}");187			failed = true;188		}189	}190191	// FIXME: Connection might be disconnected after activation run192193	if action.should_activate() && !failed {194		let _span = info_span!("activating").entered();195		info!("executing activation script");196		let specialised = if let Some(specialisation) = specialisation {197			let mut specialised = built.join("specialisation");198			specialised.push(specialisation);199			specialised200		} else {201			built.clone()202		};203		let switch_script = specialised.join("bin/switch-to-configuration");204		let mut cmd = host.cmd(switch_script).in_current_span().await?;205		cmd.arg(action.name().expect("upload.should_activate == false"));206		if let Err(e) = cmd.sudo().run().in_current_span().await {207			error!("failed to activate: {e}");208			failed = true;209		}210	}211	if action.should_create_rollback_marker() {212		if !disable_rollback {213			if failed {214				if action.should_schedule_rollback_run() {215					info!("executing rollback");216					if let Err(e) = host217						.systemctl_start("rollback-watchdog.service")218						.instrument(info_span!("rollback"))219						.await220					{221						error!("failed to trigger rollback: {e}")222					}223				}224			} else {225				info!("trying to mark upgrade as successful");226				if let Err(e) = host227					.rm_file("/etc/fleet_rollback_marker", true)228					.in_current_span()229					.await230				{231					error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")232				}233			}234			info!("disarming watchdog, just in case");235			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {236				// It is ok, if there was no reboot - then timer might not be running.237			}238			if action.should_schedule_rollback_run() {239				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {240					error!("failed to disarm rollback run: {e}");241				}242			}243		} else if let Err(_e) = host244			.rm_file("/etc/fleet_rollback_marker", true)245			.in_current_span()246			.await247		{248			// Marker might not exist, yet better try to remove it.249		}250	}251	Ok(())252}253254async fn build_task(255	config: Config,256	host: String,257	build_attr: &str,258	batch: Option<NixBuildBatch>,259) -> Result<PathBuf> {260	info!("building");261	let host = config.host(&host).await?;262	// let action = Action::from(self.subcommand.clone());263	let nixos = host.nixos_config().await?;264	let drv = nix_go!(nixos.system.build[{ build_attr }]);265	let outputs = drv.build_maybe_batch(batch).await?;266	let out_output = outputs267		.get("out")268		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;269270	Ok(out_output.clone())271}272273impl BuildSystems {274	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {275		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;276		let set = LocalSet::new();277		let build_attr = self.build_attr.clone();278		let batch = (hosts.len() > 1).then(|| {279			config280				.nix_session281				.new_build_batch("build-hosts".to_string())282		});283		for host in hosts {284			let config = config.clone();285			let span = info_span!("build", host = field::display(&host.name));286			let hostname = host.name;287			let build_attr = build_attr.clone();288			let batch = batch.clone();289			set.spawn_local(290				(async move {291					let built = match build_task(config, hostname.clone(), &build_attr, batch).await292					{293						Ok(path) => path,294						Err(e) => {295							error!("failed to deploy host: {}", e);296							return;297						}298					};299					// TODO: Handle error300					let mut out = current_dir().expect("cwd exists");301					out.push(format!("built-{}", hostname));302303					info!("linking iso image to {:?}", out);304					if let Err(e) = symlink(built, out) {305						error!("failed to symlink: {e}")306					}307				})308				.instrument(span),309			);310		}311		drop(batch);312		set.await;313		Ok(())314	}315}316317impl Deploy {318	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {319		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;320		let set = LocalSet::new();321		let batch = (hosts.len() > 1).then(|| {322			config323				.nix_session324				.new_build_batch("deploy-hosts".to_string())325		});326		for host in hosts.into_iter() {327			let config = config.clone();328			let span = info_span!("deploy", host = field::display(&host.name));329			let hostname = host.name.clone();330			let local_host = config.local_host();331			let opts = opts.clone();332			let batch = batch.clone();333334			set.spawn_local(335				(async move {336					let built =337						match build_task(config.clone(), hostname.clone(), "toplevel", batch).await338						{339							Ok(path) => path,340							Err(e) => {341								error!("failed to deploy host: {}", e);342								return;343							}344						};345					if !opts.is_local(&hostname) {346						info!("uploading system closure");347						{348							// TODO: Move to remote_derivation method.349							// Alternatively, nix store make-content-addressed can be used,350							// at least for the first deployment, to provide trusted store key.351							//352							// It is much slower, yet doesn't require root on the deployer machine.353							let Ok(mut sign) = local_host.cmd("nix").await else {354								error!("failed to setup local");355								return;356							};357							// Private key for host machine is registered in nix-sign.nix358							sign.arg("store")359								.arg("sign")360								.comparg("--key-file", "/etc/nix/private-key")361								.arg("-r")362								.arg(&built);363							if let Err(e) = sign.sudo().run_nix().await {364								warn!("failed to sign store paths: {e}");365							};366						}367						let mut tries = 0;368						loop {369							match host.remote_derivation(&built).await {370								Ok(remote) => {371									assert!(remote == built, "CA derivations aren't implemented");372									break;373								}374								Err(e) if tries < 3 => {375									tries += 1;376									warn!("copy failure ({}/3): {}", tries, e);377									sleep(Duration::from_millis(5000)).await;378								}379								Err(e) => {380									error!("upload failed: {e}");381									return;382								}383							}384						}385					}386					if let Err(e) = deploy_task(387						self.action,388						&host,389						built,390						if let Ok(v) = opts.action_attr(&host, "specialisation").await {391							v392						} else {393							error!("unreachable? failed to get specialization");394							return;395						},396						self.disable_rollback,397					)398					.await399					{400						error!("activation failed: {e}");401					}402				})403				.instrument(span),404			);405		}406		drop(batch);407		set.await;408		Ok(())409	}410}
modifiedcrates/fleet-base/Cargo.tomldiffbeforeafterboth
--- a/crates/fleet-base/Cargo.toml
+++ b/crates/fleet-base/Cargo.toml
@@ -17,6 +17,7 @@
 nixlike.workspace = true
 nom = "7.1.3"
 openssh = "0.11.0"
+rand = "0.8.5"
 serde.workspace = true
 serde_json = "1.0.127"
 tempfile.workspace = true
modifiedcrates/fleet-base/src/fleetdata.rsdiffbeforeafterboth
--- a/crates/fleet-base/src/fleetdata.rs
+++ b/crates/fleet-base/src/fleetdata.rs
@@ -6,6 +6,10 @@
 use age::Recipient;
 use chrono::{DateTime, Utc};
 use fleet_shared::SecretData;
+use rand::{
+	distributions::{Alphanumeric, DistString},
+	thread_rng,
+};
 use serde::{de::Error, Deserialize, Serialize};
 use serde_json::Value;
 
@@ -42,10 +46,17 @@
 	}
 }
 
+fn generate_gc_prefix() -> String {
+	let id = Alphanumeric.sample_string(&mut thread_rng(), 8);
+	format!("fleet-gc-{id}")
+}
+
 #[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct FleetData {
 	pub version: FleetDataVersion,
+	#[serde(default = "generate_gc_prefix")]
+	pub gc_root_prefix: String,
 
 	#[serde(default)]
 	pub hosts: BTreeMap<String, HostData>,
modifiedmodules/hosts.nixdiffbeforeafterboth
--- a/modules/hosts.nix
+++ b/modules/hosts.nix
@@ -16,6 +16,10 @@
             type = str;
             internal = true;
           };
+          gcRootPrefix = mkOption {
+            type = str;
+            internal = true;
+          };
           hosts = mkOption {
             type = attrsOf (submodule {
               options.encryptionKey = mkOption {