git.delta.rocks / jrsonnet / refs/commits / f779c26f9056

difftreelog

feat ability to select specialisation to activate

Yaroslav Bolyukin2024-07-24parent: #d9fb30d.patch.diff
in: trunk

6 files changed

modifiedCargo.lockdiffbeforeafterboth
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -784,7 +784,7 @@
  "itertools",
  "nix-eval",
  "nixlike",
- "once_cell",
+ "nom",
  "openssh",
  "owo-colors",
  "peg",
modifiedcmds/fleet/Cargo.tomldiffbeforeafterboth
--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -19,7 +19,6 @@
 serde_json.workspace = true
 tempfile.workspace = true
 time = { version = "0.3", features = ["serde"] }
-once_cell = "1.19"
 hostname = "0.4.0"
 age-core = "0.10"
 peg = "0.8"
@@ -45,6 +44,7 @@
 human-repr = { version = "1.1", optional = true }
 indicatif = { version = "0.17", optional = true }
 nix-eval.workspace = true
+nom = "7.1.3"
 
 [features]
 # Not quite stable
modifiedcmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth
before · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, Result};4use clap::{Parser, ValueEnum};5use itertools::Itertools as _;6use nix_eval::nix_go;7use tokio::{task::LocalSet, time::sleep};8use tracing::{error, field, info, info_span, warn, Instrument};910use crate::{11	command::MyCommand,12	host::{Config, ConfigHost},13};1415#[derive(Parser)]16pub struct Deploy {17	/// Disable automatic rollback18	#[clap(long)]19	disable_rollback: bool,20	/// Action to execute after system is built21	action: DeployAction,22}2324#[derive(ValueEnum, Clone, Copy)]25enum DeployAction {26	/// Upload derivation, but do not execute the update.27	Upload,28	/// Upload and execute the activation script, old version will be used after reboot.29	Test,30	/// Upload and set as current system profile, but do not execute activation script.31	Boot,32	/// Upload, set current profile, and execute activation script.33	Switch,34}3536impl DeployAction {37	pub(crate) fn name(&self) -> Option<&'static str> {38		match self {39			DeployAction::Upload => None,40			DeployAction::Test => Some("test"),41			DeployAction::Boot => Some("boot"),42			DeployAction::Switch => Some("switch"),43		}44	}45	pub(crate) fn should_switch_profile(&self) -> bool {46		matches!(self, Self::Switch | Self::Boot)47	}48	pub(crate) fn should_activate(&self) -> bool {49		matches!(self, Self::Switch | Self::Test)50	}51	pub(crate) fn should_create_rollback_marker(&self) -> bool {52		// Upload does nothing on the target machine, other than uploading the closure.53		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.54		!matches!(self, Self::Upload)55	}56	pub(crate) fn should_schedule_rollback_run(&self) -> bool {57		matches!(self, Self::Switch | Self::Test)58	}59}6061#[derive(Parser, Clone)]62pub struct BuildSystems {63	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes64	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.65	#[clap(long, default_value = "toplevel")]66	build_attr: String,67}6869struct Generation {70	id: u32,71	current: bool,72	datetime: String,73}74async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {75	let mut cmd = host.cmd("nix-env").await?;76	cmd.comparg("--profile", "/nix/var/nix/profiles/system")77		.arg("--list-generations");78	// Sudo is required due to --list-generations acquiring lock on the profile.79	let data = cmd.sudo().run_string().await?;80	let generations = data81		.split('\n')82		.map(|e| e.trim())83		.filter(|&l| !l.is_empty())84		.filter_map(|g| {85			let gen: Option<Generation> = try {86				let mut parts = g.split_whitespace();87				let id = parts.next()?;88				let id: u32 = id.parse().ok()?;89				let date = parts.next()?;90				let time = parts.next()?;91				let current = if let Some(current) = parts.next() {92					if current == "(current)" {93						Some(true)94					} else {95						None96					}97				} else {98					Some(false)99				};100				let current = current?;101				if parts.next().is_some() {102					warn!("unexpected text after generation: {g}");103				}104				Generation {105					id,106					current,107					datetime: format!("{date} {time}"),108				}109			};110			if gen.is_none() {111				warn!("bad generation: {g}")112			}113			gen114		})115		.collect::<Vec<_>>();116	let current = generations117		.into_iter()118		.filter(|g| g.current)119		.at_most_one()120		.map_err(|_e| anyhow!("bad list-generations output"))?121		.ok_or_else(|| anyhow!("failed to find generation"))?;122	Ok(current)123}124125async fn deploy_task(126	action: DeployAction,127	host: &ConfigHost,128	built: PathBuf,129	disable_rollback: bool,130) -> Result<()> {131	let mut failed = false;132	// TODO: Lockfile, to prevent concurrent system switch?133	// TODO: If rollback target exists - bail, it 
should be removed. Lockfile will not work in case if rollback134	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to135	// unit name conflict in systemd-run136	// This code is tied to rollback.nix137	if !disable_rollback && action.should_create_rollback_marker() {138		let _span = info_span!("preparing").entered();139		info!("preparing for rollback");140		let generation = get_current_generation(host).await?;141		info!(142			"rollback target would be {} {}",143			generation.id, generation.datetime144		);145		{146			let mut cmd = host.cmd("sh").await?;147			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));148			if let Err(e) = cmd.sudo().run().await {149				error!("failed to set rollback marker: {e}");150				failed = true;151			}152		}153		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.154		// Kicking it on manually will work best.155		//156		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will157		// only allow one instance of it.158159		// TODO: We should also watch how this process is going.160		// After running this command, we have less than 3 minutes to deploy everything,161		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.162		// Anyway, reboot will still help in this case.163		if action.should_schedule_rollback_run() {164			let mut cmd = host.cmd("systemd-run").await?;165			cmd.comparg("--on-active", "3min")166				.comparg("--unit", "rollback-watchdog-run")167				.arg("systemctl")168				.arg("start")169				.arg("rollback-watchdog.service");170			if let Err(e) = cmd.sudo().run().await {171				error!("failed to schedule rollback run: {e}");172				failed = true;173			}174		}175	}176177	if 
action.should_switch_profile() && !failed {178		info!("switching generation");179		let mut cmd = host.cmd("nix-env").await?;180		cmd.comparg("--profile", "/nix/var/nix/profiles/system")181			.comparg("--set", &built);182		if let Err(e) = cmd.sudo().run().await {183			error!("failed to switch generation: {e}");184			failed = true;185		}186	}187188	// FIXME: Connection might be disconnected after activation run189190	if action.should_activate() && !failed {191		let _span = info_span!("activating").entered();192		info!("executing activation script");193		let mut switch_script = built.clone();194		switch_script.push("bin");195		switch_script.push("switch-to-configuration");196		let mut cmd = host.cmd(switch_script).in_current_span().await?;197		cmd.arg(action.name().expect("upload.should_activate == false"));198		if let Err(e) = cmd.sudo().run().in_current_span().await {199			error!("failed to activate: {e}");200			failed = true;201		}202	}203	if action.should_create_rollback_marker() {204		if !disable_rollback {205			if failed {206				if action.should_schedule_rollback_run() {207					info!("executing rollback");208					if let Err(e) = host209						.systemctl_start("rollback-watchdog.service")210						.instrument(info_span!("rollback"))211						.await212					{213						error!("failed to trigger rollback: {e}")214					}215				}216			} else {217				info!("trying to mark upgrade as successful");218				if let Err(e) = host219					.rm_file("/etc/fleet_rollback_marker", true)220					.in_current_span()221					.await222				{223					error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")224				}225			}226			info!("disarming watchdog, just in case");227			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {228				// It is ok, if there was no reboot - then timer might not be running.229			}230			if action.should_schedule_rollback_run() {231				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {232					error!("failed to disarm rollback run: {e}");233				}234			}235		} else if let Err(_e) = host236			.rm_file("/etc/fleet_rollback_marker", true)237			.in_current_span()238			.await239		{240			// Marker might not exist, yet better try to remove it.241		}242	}243	Ok(())244}245246async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {247	info!("building");248	let host = config.host(&host).await?;249	// let action = Action::from(self.subcommand.clone());250	let fleet_config = &config.config_field;251	let drv = nix_go!(252		fleet_config.hosts[{ &host.name }]253			.nixosSystem254			.config255			.system256			.build[{ build_attr }]257	);258	let outputs = drv.build().await.map_err(|e| {259			if build_attr == "sdImage" {260				info!("sd-image build failed");261				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");262			}263			e264		})?;265	let out_output = outputs266		.get("out")267		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;268269	Ok(out_output.clone())270}271272impl BuildSystems {273	pub async fn run(self, config: &Config) -> Result<()> {274		let hosts = config.list_hosts().await?;275		let set = LocalSet::new();276		let build_attr = self.build_attr.clone();277		for host in hosts.into_iter() {278			if config.should_skip(&host.name) {279				continue;280			}281			let config = config.clone();282			let span = info_span!("build", host = field::display(&host.name));283			let hostname = host.name;284			let 
build_attr = build_attr.clone();285			// FIXME: Since the introduction of better-nix-eval,286			// due to single repl used for builds, hosts are waiting for each other to build,287			// instead of building concurrently.288			//289			// Open multiple repls?290			//291			// Create build batcher, which will behave similar to golangs292			// WaitGroup, and start executing once all the build tasks are scheduled?293			// This also allows to cleanup build output, as there will be no longer294			// "waiting for remote machine" messages in the cases when one package is needed for295			// multiple hosts.296			set.spawn_local(297				(async move {298					let built = match build_task(config, hostname.clone(), &build_attr).await {299						Ok(path) => path,300						Err(e) => {301							error!("failed to deploy host: {}", e);302							return;303						}304					};305					// TODO: Handle error306					let mut out = current_dir().expect("cwd exists");307					out.push(format!("built-{}", hostname));308309					info!("linking iso image to {:?}", out);310					if let Err(e) = symlink(built, out) {311						error!("failed to symlink: {e}")312					}313				})314				.instrument(span),315			);316		}317		set.await;318		Ok(())319	}320}321322impl Deploy {323	pub async fn run(self, config: &Config) -> Result<()> {324		let hosts = config.list_hosts().await?;325		let set = LocalSet::new();326		for host in hosts.into_iter() {327			if config.should_skip(&host.name) {328				continue;329			}330			let config = config.clone();331			let span = info_span!("deploy", host = field::display(&host.name));332			let hostname = host.name.clone();333			// FIXME: Fix repl concurrency (see build-systems)334			set.spawn_local(335				(async move {336					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await337					{338						Ok(path) => path,339						Err(e) => {340							error!("failed to deploy host: {}", e);341							return;342						}343					};344					if !config.is_local(&hostname) {345						
info!("uploading system closure");346						{347							// TODO: Move to remote_derivation method.348							// Alternatively, nix store make-content-addressed can be used,349							// at least for the first deployment, to provide trusted store key.350							//351							// It is much slower, yet doesn't require root on the deployer machine.352							let mut sign = MyCommand::new("nix");353							// Private key for host machine is registered in nix-sign.nix354							sign.arg("store")355								.arg("sign")356								.comparg("--key-file", "/etc/nix/private-key")357								.arg("-r")358								.arg(&built);359							if let Err(e) = sign.sudo().run_nix().await {360								warn!("Failed to sign store paths: {e}");361							};362						}363						let mut tries = 0;364						loop {365							match host.remote_derivation(&built).await {366								Ok(remote) => {367									assert!(remote == built, "CA derivations aren't implemented");368									break;369								}370								Err(e) if tries < 3 => {371									tries += 1;372									warn!("copy failure ({}/3): {}", tries, e);373									sleep(Duration::from_millis(5000)).await;374								}375								Err(e) => {376									error!("upload failed: {e}");377									return;378								}379							}380						}381					}382					if let Err(e) =383						deploy_task(self.action, &host, built, self.disable_rollback).await384					{385						error!("activation failed: {e}");386					}387				})388				.instrument(span),389			);390		}391		set.await;392		Ok(())393	}394}
after · cmds/fleet/src/cmds/build_systems.rs
1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, Result};4use clap::{Parser, ValueEnum};5use itertools::Itertools as _;6use nix_eval::nix_go;7use tokio::{task::LocalSet, time::sleep};8use tracing::{error, field, info, info_span, warn, Instrument};910use crate::{11	command::MyCommand,12	host::{Config, ConfigHost},13};1415#[derive(Parser)]16pub struct Deploy {17	/// Disable automatic rollback18	#[clap(long)]19	disable_rollback: bool,20	/// Action to execute after system is built21	action: DeployAction,22}2324#[derive(ValueEnum, Clone, Copy)]25enum DeployAction {26	/// Upload derivation, but do not execute the update.27	Upload,28	/// Upload and execute the activation script, old version will be used after reboot.29	Test,30	/// Upload and set as current system profile, but do not execute activation script.31	Boot,32	/// Upload, set current profile, and execute activation script.33	Switch,34}3536impl DeployAction {37	pub(crate) fn name(&self) -> Option<&'static str> {38		match self {39			DeployAction::Upload => None,40			DeployAction::Test => Some("test"),41			DeployAction::Boot => Some("boot"),42			DeployAction::Switch => Some("switch"),43		}44	}45	pub(crate) fn should_switch_profile(&self) -> bool {46		matches!(self, Self::Switch | Self::Boot)47	}48	pub(crate) fn should_activate(&self) -> bool {49		matches!(self, Self::Switch | Self::Test)50	}51	pub(crate) fn should_create_rollback_marker(&self) -> bool {52		// Upload does nothing on the target machine, other than uploading the closure.53		// In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.54		!matches!(self, Self::Upload)55	}56	pub(crate) fn should_schedule_rollback_run(&self) -> bool {57		matches!(self, Self::Switch | Self::Test)58	}59}6061#[derive(Parser, Clone)]62pub struct BuildSystems {63	/// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes64	/// are "sdImage"/"isoImage", and your configuration may include any other build attributes.65	#[clap(long, default_value = "toplevel")]66	build_attr: String,67}6869struct Generation {70	id: u32,71	current: bool,72	datetime: String,73}74async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {75	let mut cmd = host.cmd("nix-env").await?;76	cmd.comparg("--profile", "/nix/var/nix/profiles/system")77		.arg("--list-generations");78	// Sudo is required due to --list-generations acquiring lock on the profile.79	let data = cmd.sudo().run_string().await?;80	let generations = data81		.split('\n')82		.map(|e| e.trim())83		.filter(|&l| !l.is_empty())84		.filter_map(|g| {85			let gen: Option<Generation> = try {86				let mut parts = g.split_whitespace();87				let id = parts.next()?;88				let id: u32 = id.parse().ok()?;89				let date = parts.next()?;90				let time = parts.next()?;91				let current = if let Some(current) = parts.next() {92					if current == "(current)" {93						Some(true)94					} else {95						None96					}97				} else {98					Some(false)99				};100				let current = current?;101				if parts.next().is_some() {102					warn!("unexpected text after generation: {g}");103				}104				Generation {105					id,106					current,107					datetime: format!("{date} {time}"),108				}109			};110			if gen.is_none() {111				warn!("bad generation: {g}")112			}113			gen114		})115		.collect::<Vec<_>>();116	let current = generations117		.into_iter()118		.filter(|g| g.current)119		.at_most_one()120		.map_err(|_e| anyhow!("bad list-generations output"))?121		.ok_or_else(|| anyhow!("failed to find generation"))?;122	Ok(current)123}124125async fn deploy_task(126	action: DeployAction,127	host: &ConfigHost,128	built: PathBuf,129	specialisation: Option<String>,130	disable_rollback: bool,131) -> Result<()> {132	let mut failed = false;133	// TODO: Lockfile, to prevent concurrent system switch?134	// TODO: If 
rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback135	// is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to136	// unit name conflict in systemd-run137	// This code is tied to rollback.nix138	if !disable_rollback && action.should_create_rollback_marker() {139		let _span = info_span!("preparing").entered();140		info!("preparing for rollback");141		let generation = get_current_generation(host).await?;142		info!(143			"rollback target would be {} {}",144			generation.id, generation.datetime145		);146		{147			let mut cmd = host.cmd("sh").await?;148			cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));149			if let Err(e) = cmd.sudo().run().await {150				error!("failed to set rollback marker: {e}");151				failed = true;152			}153		}154		// Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.155		// Kicking it on manually will work best.156		//157		// There wouldn't be conflict, because here we trigger start of the primary service, and systemd will158		// only allow one instance of it.159160		// TODO: We should also watch how this process is going.161		// After running this command, we have less than 3 minutes to deploy everything,162		// if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.163		// Anyway, reboot will still help in this case.164		if action.should_schedule_rollback_run() {165			let mut cmd = host.cmd("systemd-run").await?;166			cmd.comparg("--on-active", "3min")167				.comparg("--unit", "rollback-watchdog-run")168				.arg("systemctl")169				.arg("start")170				.arg("rollback-watchdog.service");171			if let Err(e) = cmd.sudo().run().await {172				error!("failed to schedule rollback run: {e}");173				failed = true;174			}175		
}176	}177178	if action.should_switch_profile() && !failed {179		info!("switching generation");180		let mut cmd = host.cmd("nix-env").await?;181		cmd.comparg("--profile", "/nix/var/nix/profiles/system")182			.comparg("--set", &built);183		if let Err(e) = cmd.sudo().run().await {184			error!("failed to switch generation: {e}");185			failed = true;186		}187	}188189	// FIXME: Connection might be disconnected after activation run190191	if action.should_activate() && !failed {192		let _span = info_span!("activating").entered();193		info!("executing activation script");194		let specialised = if let Some(specialisation) = specialisation {195			let mut specialised = built.join("specialisation");196			specialised.push(specialisation);197			specialised198		} else {199			built.clone()200		};201		let switch_script = specialised.join("bin/switch-to-configuration");202		let mut cmd = host.cmd(switch_script).in_current_span().await?;203		cmd.arg(action.name().expect("upload.should_activate == false"));204		if let Err(e) = cmd.sudo().run().in_current_span().await {205			error!("failed to activate: {e}");206			failed = true;207		}208	}209	if action.should_create_rollback_marker() {210		if !disable_rollback {211			if failed {212				if action.should_schedule_rollback_run() {213					info!("executing rollback");214					if let Err(e) = host215						.systemctl_start("rollback-watchdog.service")216						.instrument(info_span!("rollback"))217						.await218					{219						error!("failed to trigger rollback: {e}")220					}221				}222			} else {223				info!("trying to mark upgrade as successful");224				if let Err(e) = host225					.rm_file("/etc/fleet_rollback_marker", true)226					.in_current_span()227					.await228				{229					error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")230				}231			}232			info!("disarming watchdog, just in case");233			if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {234				// It is ok, if there was no reboot - then timer might not be running.235			}236			if action.should_schedule_rollback_run() {237				if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {238					error!("failed to disarm rollback run: {e}");239				}240			}241		} else if let Err(_e) = host242			.rm_file("/etc/fleet_rollback_marker", true)243			.in_current_span()244			.await245		{246			// Marker might not exist, yet better try to remove it.247		}248	}249	Ok(())250}251252async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {253	info!("building");254	let host = config.host(&host).await?;255	// let action = Action::from(self.subcommand.clone());256	let fleet_config = &config.config_field;257	let drv = nix_go!(258		fleet_config.hosts[{ &host.name }]259			.nixosSystem260			.config261			.system262			.build[{ build_attr }]263	);264	let outputs = drv.build().await.inspect_err(|_| {265			if build_attr == "sdImage" {266				info!("sd-image build failed");267				info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");268			}269		})?;270	let out_output = outputs271		.get("out")272		.ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;273274	Ok(out_output.clone())275}276277impl BuildSystems {278	pub async fn run(self, config: &Config) -> Result<()> {279		let hosts = config.list_hosts().await?;280		let set = LocalSet::new();281		let build_attr = self.build_attr.clone();282		for host in hosts.into_iter() {283			if config.should_skip(&host).await? 
{284				continue;285			}286			let config = config.clone();287			let span = info_span!("build", host = field::display(&host.name));288			let hostname = host.name;289			let build_attr = build_attr.clone();290			// FIXME: Since the introduction of better-nix-eval,291			// due to single repl used for builds, hosts are waiting for each other to build,292			// instead of building concurrently.293			//294			// Open multiple repls?295			//296			// Create build batcher, which will behave similar to golangs297			// WaitGroup, and start executing once all the build tasks are scheduled?298			// This also allows to cleanup build output, as there will be no longer299			// "waiting for remote machine" messages in the cases when one package is needed for300			// multiple hosts.301			set.spawn_local(302				(async move {303					let built = match build_task(config, hostname.clone(), &build_attr).await {304						Ok(path) => path,305						Err(e) => {306							error!("failed to deploy host: {}", e);307							return;308						}309					};310					// TODO: Handle error311					let mut out = current_dir().expect("cwd exists");312					out.push(format!("built-{}", hostname));313314					info!("linking iso image to {:?}", out);315					if let Err(e) = symlink(built, out) {316						error!("failed to symlink: {e}")317					}318				})319				.instrument(span),320			);321		}322		set.await;323		Ok(())324	}325}326327impl Deploy {328	pub async fn run(self, config: &Config) -> Result<()> {329		let hosts = config.list_hosts().await?;330		let set = LocalSet::new();331		for host in hosts.into_iter() {332			if config.should_skip(&host).await? 
{333				continue;334			}335			let config = config.clone();336			let span = info_span!("deploy", host = field::display(&host.name));337			let hostname = host.name.clone();338			// FIXME: Fix repl concurrency (see build-systems)339			set.spawn_local(340				(async move {341					let built = match build_task(config.clone(), hostname.clone(), "toplevel").await342					{343						Ok(path) => path,344						Err(e) => {345							error!("failed to deploy host: {}", e);346							return;347						}348					};349					if !config.is_local(&hostname) {350						info!("uploading system closure");351						{352							// TODO: Move to remote_derivation method.353							// Alternatively, nix store make-content-addressed can be used,354							// at least for the first deployment, to provide trusted store key.355							//356							// It is much slower, yet doesn't require root on the deployer machine.357							let mut sign = MyCommand::new("nix");358							// Private key for host machine is registered in nix-sign.nix359							sign.arg("store")360								.arg("sign")361								.comparg("--key-file", "/etc/nix/private-key")362								.arg("-r")363								.arg(&built);364							if let Err(e) = sign.sudo().run_nix().await {365								warn!("Failed to sign store paths: {e}");366							};367						}368						let mut tries = 0;369						loop {370							match host.remote_derivation(&built).await {371								Ok(remote) => {372									assert!(remote == built, "CA derivations aren't implemented");373									break;374								}375								Err(e) if tries < 3 => {376									tries += 1;377									warn!("copy failure ({}/3): {}", tries, e);378									sleep(Duration::from_millis(5000)).await;379								}380								Err(e) => {381									error!("upload failed: {e}");382									return;383								}384							}385						}386					}387					if let Err(e) = deploy_task(388						self.action,389						&host,390						built,391						if let Ok(v) = config.action_attr(&host, "specialisation").await {392							v393						} 
else {394							error!("unreachable? failed to get specialization");395							return;396						},397						self.disable_rollback,398					)399					.await400					{401						error!("activation failed: {e}");402					}403				})404				.instrument(span),405			);406		}407		set.await;408		Ok(())409	}410}
modifiedcmds/fleet/src/cmds/secrets/mod.rsdiffbeforeafterboth
--- a/cmds/fleet/src/cmds/secrets/mod.rs
+++ b/cmds/fleet/src/cmds/secrets/mod.rs
@@ -436,7 +436,7 @@
 		match self {
 			Secret::ForceKeys => {
 				for host in config.list_hosts().await? {
-					if config.should_skip(&host.name) {
+					if config.should_skip(&host).await? {
 						continue;
 					}
 					config.key(&host.name).await?;
@@ -639,7 +639,7 @@
 					}
 				}
 				for host in config.list_hosts().await? {
-					if config.should_skip(&host.name) {
+					if config.should_skip(&host).await? {
 						continue;
 					}
 
modifiedcmds/fleet/src/host.rsdiffbeforeafterboth
--- a/cmds/fleet/src/host.rs
+++ b/cmds/fleet/src/host.rs
@@ -1,4 +1,6 @@
 use std::{
+	cell::OnceCell,
+	collections::BTreeMap,
 	env::current_dir,
 	ffi::{OsStr, OsString},
 	fmt::Display,
@@ -10,9 +12,16 @@
 };
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use clap::{ArgGroup, Parser};
+use clap::Parser;
 use fleet_shared::SecretData;
 use nix_eval::{nix_go, nix_go_json, NixSessionPool, Value};
+use nom::{
+	bytes::complete::take_while1,
+	character::complete::char,
+	combinator::{map, opt},
+	multi::separated_list1,
+	sequence::{preceded, separated_pair},
+};
 use openssh::SessionBuilder;
 use serde::de::DeserializeOwned;
 use tempfile::NamedTempFile;
@@ -53,10 +62,26 @@
 	pub name: String,
 	pub local: bool,
 	pub session: OnceLock<Arc<openssh::Session>>,
+	groups: OnceCell<Vec<String>>,
 
 	pub nixos_config: Option<Value>,
 }
 impl ConfigHost {
+	/// Returns the tags declared for this host in its NixOS configuration.
+	///
+	/// The first successful evaluation is stored in `self.groups`; later calls
+	/// return a clone of that cached list. When nothing is cached and the host
+	/// has no `nixos_config`, an empty list is returned.
+	pub async fn tags(&self) -> Result<Vec<String>> {
+		match (self.groups.get(), &self.nixos_config) {
+			// Already evaluated: hand out a copy of the cached list.
+			(Some(cached), _) => Ok(cached.clone()),
+			// No NixOS configuration attached, so there is nothing to evaluate.
+			(None, None) => Ok(Vec::new()),
+			(None, Some(nixos_config)) => {
+				let evaluated: Vec<String> = nix_go_json!(nixos_config.tags);
+				// TOCTOU is possible if the config changes between calls, but that case is
+				// not handled anywhere else either: tags are assumed to be stable, so a
+				// failed `set` (cell already filled) is deliberately ignored.
+				let _ = self.groups.set(evaluated.clone());
+				Ok(evaluated)
+			}
+		}
+	}
 	async fn open_session(&self) -> Result<Arc<openssh::Session>> {
 		assert!(!self.local, "do not open ssh connection to local session");
 		// FIXME: TOCTOU
@@ -217,15 +242,71 @@
 }
 
 impl Config {
-	pub fn should_skip(&self, host: &str) -> bool {
-		if !self.opts.skip.is_empty() {
-			self.opts.skip.iter().any(|h| h as &str == host)
-		} else if !self.opts.only.is_empty() {
-			!self.opts.only.iter().any(|h| h as &str == host)
-		} else {
-			false
+	/// Decides whether `host` is excluded by the `--skip` / `--only` options.
+	///
+	/// A host named in `--skip` is always skipped. Without any `--only` item all
+	/// remaining hosts are processed; otherwise the host has to be selected by an
+	/// `--only` item, either by its name or by one of its tags.
+	pub async fn should_skip(&self, host: &ConfigHost) -> Result<bool> {
+		let only = &self.opts.only;
+		if self.opts.skip.iter().any(|skipped| *skipped == host.name) {
+			return Ok(true);
+		}
+		if only.is_empty() {
+			return Ok(false);
+		}
+		let selected_by_name = only
+			.iter()
+			.any(|item| matches!(item, HostItem::Host { name, .. } if *name == host.name));
+		if selected_by_name {
+			return Ok(false);
 		}
+		// Host tags are only evaluated when at least one tag selector was given.
+		if !only.iter().any(|item| matches!(item, HostItem::Tag { .. })) {
+			return Ok(true);
+		}
+		let host_tags = host.tags().await?;
+		let selected_by_tag = only
+			.iter()
+			.any(|item| matches!(item, HostItem::Tag { name, .. } if host_tags.contains(name)));
+		Ok(!selected_by_tag)
 	}
+	/// Looks up `attr` among the attributes attached to the `--only` items that select `host`.
+	///
+	/// An item naming the host directly takes precedence over tag items. Within each
+	/// kind, the first item (in the order the items are stored) that carries `attr` wins.
+	/// Returns `Ok(None)` when no selecting item defines the attribute, which includes
+	/// the case of an empty `--only` list.
+	///
+	/// # Errors
+	/// Fails if the host tags have to be looked up and `ConfigHost::tags` returns an error.
+	pub async fn action_attr(&self, host: &ConfigHost, attr: &str) -> Result<Option<String>> {
+		let only = &self.opts.only;
+
+		// One map lookup per item: `get` both tests for presence and yields the value.
+		let by_name = only.iter().find_map(|item| match item {
+			HostItem::Host { name, attrs } if *name == host.name => attrs.get(attr),
+			_ => None,
+		});
+		if let Some(value) = by_name {
+			return Ok(Some(value.clone()));
+		}
+
+		// Host tags are only looked up when some tag item actually carries `attr`.
+		let any_tag_has_attr = only
+			.iter()
+			.any(|item| matches!(item, HostItem::Tag { attrs, .. } if attrs.contains_key(attr)));
+		if !any_tag_has_attr {
+			return Ok(None);
+		}
+
+		let host_tags = host.tags().await?;
+		let by_tag = only.iter().find_map(|item| match item {
+			HostItem::Tag { name, attrs } if host_tags.contains(name) => attrs.get(attr),
+			_ => None,
+		});
+		Ok(by_tag.cloned())
+	}
 	pub fn is_local(&self, host: &str) -> bool {
 		self.opts.localhost.as_ref().map(|s| s as &str) == Some(host)
 	}
@@ -237,6 +318,11 @@
 			local: true,
 			session: OnceLock::new(),
 			nixos_config: None,
+			groups: {
+				let cell = OnceCell::new();
+				let _ = cell.set(vec![]);
+				cell
+			},
 		}
 	}
 
@@ -249,6 +335,7 @@
 			local: self.is_local(name),
 			session: OnceLock::new(),
 			nixos_config: Some(nixos_config),
+			groups: OnceCell::new(),
 		})
 	}
 	pub async fn list_hosts(&self) -> Result<Vec<ConfigHost>> {
@@ -356,15 +443,59 @@
 	}
 }
 
+#[derive(Clone)]
+enum HostItem {
+	Host {
+		name: String,
+		attrs: BTreeMap<String, String>,
+	},
+	Tag {
+		name: String,
+		attrs: BTreeMap<String, String>,
+	},
+}
+fn host_item_parser(input: &str) -> Result<HostItem, String> {
+	fn err_to_string(err: nom::Err<nom::error::Error<&str>>) -> String {
+		err.to_string()
+	}
+
+	let (input, is_tag) = map(opt(char('@')), |c| c.is_some())(input).map_err(err_to_string)?;
+	let (input, name) = map(
+		take_while1(|v| v != ',' && v != '?' && v != '@'),
+		str::to_owned,
+	)(input)
+	.map_err(err_to_string)?;
+
+	let kw_item = separated_pair(
+		map(take_while1(|v| v != '&' && v != '='), str::to_owned),
+		char('='),
+		map(take_while1(|v| v != '&'), str::to_owned),
+	);
+	let kw = map(separated_list1(char('&'), kw_item), |vec| {
+		vec.into_iter().collect::<BTreeMap<_, _>>()
+	});
+	let mut opt_kw = map(opt(preceded(char('?'), kw)), Option::unwrap_or_default);
+
+	let (input, attrs) = opt_kw(input).map_err(err_to_string)?;
+
+	if !input.is_empty() {
+		return Err(format!("unexpected trailing input: {input:?}"));
+	}
+	Ok(if is_tag {
+		HostItem::Tag { name, attrs }
+	} else {
+		HostItem::Host { name, attrs }
+	})
+}
+
 #[derive(Parser, Clone)]
-#[clap(group = ArgGroup::new("target_hosts"))]
 pub struct FleetOpts {
 	/// All hosts except those would be skipped
-	#[clap(long, number_of_values = 1, group = "target_hosts")]
-	only: Vec<String>,
+	#[clap(long, number_of_values = 1, value_parser = host_item_parser)]
+	only: Vec<HostItem>,
 
 	/// Hosts to skip
-	#[clap(long, number_of_values = 1, group = "target_hosts")]
+	#[clap(long, number_of_values = 1)]
 	skip: Vec<String>,
 
 	/// Host, which should be treated as current machine
modifiedflake.lockdiffbeforeafterboth
--- a/flake.lock
+++ b/flake.lock
@@ -7,11 +7,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1720226507,
-        "narHash": "sha256-yHVvNsgrpyNTXZBEokL8uyB2J6gB1wEx0KOJzoeZi1A=",
+        "lastModified": 1721699339,
+        "narHash": "sha256-UqtSwU13vpzzM6w8tGghEbA7ObM3NCDzSpz19QQo9XE=",
         "owner": "ipetkov",
         "repo": "crane",
-        "rev": "0aed560c5c0a61c9385bddff471a13036203e11c",
+        "rev": "0081e9c447f3b70822c142908f08ceeb436982b8",
         "type": "github"
       },
       "original": {
@@ -40,11 +40,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1720525988,
-        "narHash": "sha256-6Vvrwl2rKrRt5gAYTFlM/pihCwHw8SY2o81TBm7KhIQ=",
+        "lastModified": 1721814637,
+        "narHash": "sha256-L3QkCvxeByJfW45wLkdZ9pL5h9PezOwwfx7G2sRfjiU=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "a630e7a8476e51b116f1ca7444dbad20701823d7",
+        "rev": "e0c444a0b8413a31df199052f5714d409dc4c1d0",
         "type": "github"
       },
       "original": {
@@ -68,11 +68,11 @@
     },
     "nixpkgs-stable-for-tests": {
       "locked": {
-        "lastModified": 1720386169,
-        "narHash": "sha256-NGKVY4PjzwAa4upkGtAMz1npHGoRzWotlSnVlqI40mo=",
+        "lastModified": 1721548954,
+        "narHash": "sha256-7cCC8+Tdq1+3OPyc3+gVo9dzUNkNIQfwSDJ2HSi2u3o=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "194846768975b7ad2c4988bdb82572c00222c0d7",
+        "rev": "63d37ccd2d178d54e7fb691d7ec76000740ea24a",
         "type": "github"
       },
       "original": {
@@ -98,11 +98,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1720491570,
-        "narHash": "sha256-PHS2BcQ9kxBpu9GKlDg3uAlrX/ahQOoAiVmwGl6BjD4=",
+        "lastModified": 1721810656,
+        "narHash": "sha256-33UCMmgPL+sz06+iupNkl99hcBABP56ENcxSoKqr0TY=",
         "owner": "oxalica",
         "repo": "rust-overlay",
-        "rev": "b970af40fdc4bd80fd764796c5f97c15e2b564eb",
+        "rev": "a6afdaab4a47d6ecf647a74968e92a51c4a18e5a",
         "type": "github"
       },
       "original": {