git.delta.rocks / jrsonnet / refs/commits / 3972fee37ee3

difftreelog

feat explicitly mark hosts as managed by fleet

Lach2025-04-05parent: #a1a72ce.patch.diff
in: trunk

7 files changed

modifiedCargo.lockdiffbeforeafterboth
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,6 +924,7 @@
  "hostname",
  "human-repr",
  "indicatif",
+ "indoc",
  "itertools 0.13.0",
  "nix-eval",
  "nixlike",
@@ -1537,6 +1538,12 @@
 ]
 
 [[package]]
+name = "indoc"
+version = "2.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
+
+[[package]]
 name = "inout"
 version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
modifiedcmds/fleet/Cargo.tomldiffbeforeafterboth
--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -47,6 +47,7 @@
 nix-eval.workspace = true
 nom = "7.1.3"
 fleet-base = { version = "0.1.0", path = "../../crates/fleet-base" }
+indoc = "2.0.6"
 
 [features]
 default = ["indicatif"]
modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
after · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, str::FromStr, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6	host::{Config, ConfigHost},7	opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16	/// Disable automatic rollback17	#[clap(long)]18	disable_rollback: bool,19	/// Action to execute after system is built20	action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25	/// Upload derivation, but do not execute the update.26	Upload,27	/// Upload and execute the activation script, old version will be used after reboot.28	Test,29	/// Upload and set as current system profile, but do not execute activation script.30	Boot,31	/// Upload, set current profile, and execute activation script.32	Switch,33}3435impl DeployAction {36	pub(crate) fn name(&self) -> Option<&'static str> {37		match self {38			Self::Upload => None,39			Self::Test => Some("test"),40			Self::Boot => Some("boot"),41			Self::Switch => Some("switch"),42		}43	}44	pub(crate) fn should_switch_profile(&self) -> bool {45		matches!(self, Self::Switch | Self::Boot)46	}47	pub(crate) fn should_activate(&self) -> bool {48		matches!(self, Self::Switch | Self::Test | Self::Boot)49	}50	pub(crate) fn should_create_rollback_marker(&self) -> bool {51		// Upload does nothing on the target machine, other than uploading the closure.52		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53		!matches!(self, Self::Upload)54	}55	pub(crate) fn should_schedule_rollback_run(&self) -> bool {56		matches!(self, Self::Switch | Self::Test)57	}58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62	/// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64	#[clap(long, default_value = "toplevel")]65	build_attr: String,66}6768struct Generation {69	id: u32,70	current: bool,71	datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75	let mut parts = g.split_whitespace();76	let id = parts.next()?;77	let id: u32 = id.parse().ok()?;78	let date = parts.next()?;79	let time = parts.next()?;80	let current = if let Some(current) = parts.next() {81		if current == "(current)" {82			Some(true)83		} else {84			None85		}86	} else {87		Some(false)88	};89	let current = current?;90	if parts.next().is_some() {91		warn!("unexpected text after generation: {g}");92	}93	Some(Generation {94		id,95		current,96		datetime: format!("{date} {time}"),97	})98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101	let mut cmd = host.cmd("nix-env").await?;102	cmd.comparg("--profile", "/nix/var/nix/profiles/system")103		.arg("--list-generations");104	// Sudo is required due to --list-generations acquiring lock on the profile.105	let data = cmd.sudo().run_string().await?;106	let generations = data107		.split('\n')108		.map(|e| e.trim())109		.filter(|&l| !l.is_empty())110		.filter_map(|g| {111			let gen = parse_generation_line(g);112			if gen.is_none() {113				warn!("bad generation: {g}");114			}115			gen116		})117		.collect::<Vec<_>>();118	let current = generations119		.into_iter()120		.filter(|g| g.current)121		.at_most_one()122		.map_err(|_e| anyhow!("bad list-generations output"))?123		.ok_or_else(|| anyhow!("failed to find generation"))?;124	Ok(current)125}126127async fn deploy_task(128	action: DeployAction,129	host: &ConfigHost,130	built: PathBuf,131	specialisation: Option<String>,132	disable_rollback: bool,133) -> Result<()> {134	let mut failed = false;135136	// TODO: Lockfile, to prevent concurrent system switch?137	// TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback138	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to139	// unit name conflict in systemd-run140	// This code is tied to rollback.nix141	if !disable_rollback && action.should_create_rollback_marker() {142		let _span = info_span!("preparing").entered();143		info!("preparing for rollback");144		let generation = get_current_generation(host).await?;145		info!(146			"rollback target would be {} {}",147			generation.id, generation.datetime148		);149		{150			let mut cmd = host.cmd("sh").await?;151			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));152			if let Err(e) = cmd.sudo().run().await {153				error!("failed to set rollback marker: {e}");154				failed = true;155			}156		}157		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.158		// Kicking it on manually will work best.159		//160		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will161		// only allow one instance of it.162163		// TODO: We should also watch how this process is going.164		// After running this command, we have less than 3 minutes to deploy everything,165		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.166		// Anyway, reboot will still help in this case.167		if action.should_schedule_rollback_run() {168			let mut cmd = host.cmd("systemd-run").await?;169			cmd.comparg("--on-active", "3min")170				.comparg("--unit", "rollback-watchdog-run")171				.arg("systemctl")172				.arg("start")173				.arg("rollback-watchdog.service");174			if let Err(e) = cmd.sudo().run().await {175				error!("failed to schedule rollback run: {e}");176				failed = true;177			}178		}179	}180181	if action.should_switch_profile() && !failed {182		info!("switching system profile generation");183		// It would also be possible to update profile atomically during copy:184		// https://github.com/NixOS/nix/pull/11657185		let mut cmd = host.cmd("nix").await?;186		cmd.arg("build");187		cmd.comparg("--profile", "/nix/var/nix/profiles/system");188		cmd.arg(&built);189		if let Err(e) = cmd.sudo().run_nix().await {190			error!("failed to switch system profile generation: {e}");191			failed = true;192		}193	}194195	// FIXME: Connection might be disconnected after activation run196197	if action.should_activate() && !failed {198		let _span = info_span!("activating").entered();199		info!("executing activation script");200		let specialised = if let Some(specialisation) = specialisation {201			let mut specialised = built.join("specialisation");202			specialised.push(specialisation);203			specialised204		} else {205			built.clone()206		};207		let switch_script = specialised.join("bin/switch-to-configuration");208		let mut cmd = host.cmd(switch_script).in_current_span().await?;209		cmd.arg(action.name().expect("upload.should_activate == false"));210		if let Err(e) = cmd.sudo().run().in_current_span().await {211			error!("failed to activate: {e}");212			failed = true;213		}214	}215	if action.should_create_rollback_marker() {216		if !disable_rollback {217			if failed {218				if action.should_schedule_rollback_run() {219					info!("executing rollback");220					if let Err(e) = host221						.systemctl_start("rollback-watchdog.service")222						.instrument(info_span!("rollback"))223						.await224					{225						error!("failed to trigger rollback: {e}")226					}227				}228			} else {229				info!("trying to mark upgrade as successful");230				if let Err(e) = host231					.rm_file("/etc/fleet_rollback_marker", true)232					.in_current_span()233					.await234				{235					error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")236				}237			}238			info!("disarming watchdog, just in case");239			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {240				// It is ok, if there was no reboot - then timer might not be running.241			}242			if action.should_schedule_rollback_run() {243				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {244					error!("failed to disarm rollback run: {e}");245				}246			}247		} else if let Err(_e) = host248			.rm_file("/etc/fleet_rollback_marker", true)249			.in_current_span()250			.await251		{252			// Marker might not exist, yet better try to remove it.253		}254	}255	Ok(())256}257258async fn build_task(259	config: Config,260	hostname: String,261	build_attr: &str,262	batch: Option<NixBuildBatch>,263) -> Result<PathBuf> {264	info!("building");265	let host = config.host(&hostname).await?;266	// let action = Action::from(self.subcommand.clone());267	let nixos = host.nixos_config().await?;268	let drv = nix_go!(nixos.system.build[{ build_attr }]);269	let outputs = drv.build_maybe_batch(batch).await?;270	let out_output = outputs271		.get("out")272		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;273274	{275		info!("adding gc root");276		let mut cmd = config.local_host().cmd("nix").await?;277		cmd.arg("build")278			.comparg(279				"--profile",280				format!(281					"/nix/var/nix/profiles/{}-{hostname}",282					config.data().gc_root_prefix283				),284			)285			.arg(out_output);286		cmd.sudo().run_nix().await?;287	}288289	Ok(out_output.clone())290}291292impl BuildSystems {293	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {294		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;295		let set = LocalSet::new();296		let build_attr = self.build_attr.clone();297		let batch = (hosts.len() > 1).then(|| {298			config299				.nix_session300				.new_build_batch("build-hosts".to_string())301		});302		for host in hosts {303			let config = config.clone();304			let span = info_span!("build", host = field::display(&host.name));305			let hostname = host.name;306			let build_attr = build_attr.clone();307			let batch = batch.clone();308			set.spawn_local(309				(async move {310					let built = match build_task(config, hostname.clone(), &build_attr, batch).await311					{312						Ok(path) => path,313						Err(e) => {314							error!("failed to deploy host: {}", e);315							return;316						}317					};318					// TODO: Handle error319					let mut out = current_dir().expect("cwd exists");320					out.push(format!("built-{}", hostname));321322					info!("linking iso image to {:?}", out);323					if let Err(e) = symlink(built, out) {324						error!("failed to symlink: {e}")325					}326				})327				.instrument(span),328			);329		}330		drop(batch);331		set.await;332		Ok(())333	}334}335336#[derive(Clone, PartialEq, Copy)]337enum DeployKind {338	// NixOS => NixOS managed by fleet339	UpgradeToFleet,340	// NixOS managed by fleet => NixOS managed by fleet341	Fleet,342}343impl FromStr for DeployKind {344	type Err = anyhow::Error;345	fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {346		match s {347			"upgrade-to-fleet" => Ok(Self::UpgradeToFleet),348			"fleet" => Ok(Self::Fleet),349			v => bail!("unknown deploy_kind: {v}; expected on of \"upgrade-to-fleet\", \"fleet\""),350		}351	}352}353354impl Deploy {355	pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {356		let hosts = opts.filter_skipped(config.list_hosts().await?).await?;357		let set = LocalSet::new();358		let batch = (hosts.len() > 1).then(|| {359			config360				.nix_session361				.new_build_batch("deploy-hosts".to_string())362		});363		for host in hosts.into_iter() {364			let config = config.clone();365			let span = info_span!("deploy", host = field::display(&host.name));366			let hostname = host.name.clone();367			let local_host = config.local_host();368			let opts = opts.clone();369			let batch = batch.clone();370			let mut deploy_kind: Option<DeployKind> =371				opts.action_attr(&host, "deploy_kind").await?;372373			set.spawn_local(374				(async move {375					let built =376						match build_task(config.clone(), hostname.clone(), "toplevel", batch).await377						{378							Ok(path) => path,379							Err(e) => {380								error!("failed to build host system closure: {}", e);381								return;382							}383						};384					if deploy_kind == None {385						let is_fleet_managed = match host.file_exists("/etc/FLEET_HOST").await {386							Ok(v) => v,387							Err(e) => {388								error!("failed to query remote system kind: {}", e);389								return;390							},391						};392						if !is_fleet_managed {393							error!(indoc::indoc!{"394								host is not marked as managed by fleet395								if you're not trying to lustrate/install system from scratch,396								you should either397									1. manually create /etc/FLEET_HOST file on the target host,398									2. use ?deploy_kind=fleet host argument if you're upgrading from older version of fleet399									3. use ?deploy_kind=upgrade_to_fleet if you're upgrading from plain nixos to fleet-managed nixos400							"});401							return;402						}403						deploy_kind = Some(DeployKind::Fleet);404					}405					let deploy_kind = deploy_kind.expect("deploy_kind is set");406407					// TODO: Make disable_rollback a host attribute instead408					let mut disable_rollback = self.disable_rollback;409					if !disable_rollback && deploy_kind != DeployKind::Fleet {410						warn!("disabling rollback, as not supported by non-fleet deployment kinds");411						disable_rollback = true;412					}413414					if !opts.is_local(&hostname) {415						info!("uploading system closure");416						{417							// TODO: Move to remote_derivation method.418							// Alternatively, nix store make-content-addressed can be used,419							// at least for the first deployment, to provide trusted store key.420							//421							// It is much slower, yet doesn't require root on the deployer machine.422							let Ok(mut sign) = local_host.cmd("nix").await else {423								error!("failed to setup local");424								return;425							};426							// Private key for host machine is registered in nix-sign.nix427							sign.arg("store")428								.arg("sign")429								.comparg("--key-file", "/etc/nix/private-key")430								.arg("-r")431								.arg(&built);432							if let Err(e) = sign.sudo().run_nix().await {433								warn!("failed to sign store paths: {e}");434							};435						}436						let mut tries = 0;437						loop {438							match host.remote_derivation(&built).await {439								Ok(remote) => {440									assert!(remote == built, "CA derivations aren't implemented");441									break;442								}443								Err(e) if tries < 3 => {444									tries += 1;445									warn!("copy failure ({}/3): {}", tries, e);446									sleep(Duration::from_millis(5000)).await;447								}448								Err(e) => {449									error!("upload failed: {e}");450									return;451								}452							}453						}454					}455					if let Err(e) = deploy_task(456						self.action,457						&host,458						built,459						if let Ok(v) = opts.action_attr(&host, "specialisation").await {460							v461						} else {462							error!("unreachable? failed to get specialization");463							return;464						},465						disable_rollback,466					)467					.await468					{469						error!("activation failed: {e}");470					}471				})472				.instrument(span),473			);474		}475		drop(batch);476		set.await;477		Ok(())478	}479}
modifiedcmds/fleet/src/main.rsdiffbeforeafterboth
--- a/cmds/fleet/src/main.rs
+++ b/cmds/fleet/src/main.rs
@@ -66,9 +66,9 @@
 
 #[derive(Parser)]
 enum Opts {
-	/// Prepare systems for deployments
+	/// Build system closures
 	BuildSystems(BuildSystems),
-
+	/// Upload and switch system closures
 	Deploy(Deploy),
 	/// Secret management
 	#[clap(subcommand)]
modifiedcrates/fleet-base/src/command.rsdiffbeforeafterboth
--- a/crates/fleet-base/src/command.rs
+++ b/crates/fleet-base/src/command.rs
@@ -5,6 +5,7 @@
 use futures::StreamExt;
 use itertools::Either;
 use openssh::{OverSsh, OwningCommand, Session};
+use serde::de::DeserializeOwned;
 use tokio::{io::AsyncRead, process::Command, select};
 use tokio_util::codec::{BytesCodec, FramedRead, LinesCodec};
 use tracing::debug;
@@ -230,6 +231,10 @@
 		let bytes = self.run_bytes().await?;
 		Ok(String::from_utf8(bytes)?)
 	}
+	pub async fn run_value<T: DeserializeOwned>(self) -> Result<T> {
+		let v = self.run_string().await?;
+		Ok(serde_json::from_str(&v)?)
+	}
 	pub async fn run_bytes(self) -> Result<Vec<u8>> {
 		let str = self.clone().into_string();
 		let cmd = self.wrap_sudo_if_needed().into_command()?;
modifiedcrates/fleet-base/src/host.rsdiffbeforeafterboth
--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -105,6 +105,14 @@
 		let path = cmd.run_string().await?;
 		Ok(path.trim_end().to_owned())
 	}
+	pub async fn file_exists(&self, path: impl AsRef<OsStr>) -> Result<bool> {
+		let mut cmd = self.cmd("sh").await?;
+		cmd.arg("-c")
+			.arg("test -e \"$1\" && echo true || echo false")
+			.arg("_")
+			.arg(path);
+		Ok(cmd.run_value().await?)
+	}
 	pub async fn read_file_bin(&self, path: impl AsRef<OsStr>) -> Result<Vec<u8>> {
 		let mut cmd = self.cmd("cat").await?;
 		cmd.arg(path);
modifiedmodules/nixos/meta.nixdiffbeforeafterboth
--- a/modules/nixos/meta.nix
+++ b/modules/nixos/meta.nix
@@ -1,8 +1,17 @@
-{lib, ...}: let
+{ lib, ... }:
+let
   inherit (lib.modules) mkRemovedOptionModule;
-in {
+in
+{
   imports = [
-    (mkRemovedOptionModule ["tags"] "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts.")
-    (mkRemovedOptionModule ["network"] "network is now defined at the host level, not the nixos system level")
+    (mkRemovedOptionModule [ "tags" ]
+      "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts."
+    )
+    (mkRemovedOptionModule [
+      "network"
+    ] "network is now defined at the host level, not the nixos system level")
   ];
+
+  # Version of environment (fleet scripts such as rollback) already installed on the host
+  config.environment.etc.FLEET_HOST.text = "1";
 }