difftreelog
feat explicitly mark hosts as managed by fleet
in: trunk
7 files changed
Cargo.lockdiffbeforeafterboth--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,6 +924,7 @@
"hostname",
"human-repr",
"indicatif",
+ "indoc",
"itertools 0.13.0",
"nix-eval",
"nixlike",
@@ -1537,6 +1538,12 @@
]
[[package]]
+name = "indoc"
+version = "2.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
+
+[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
cmds/fleet/Cargo.tomldiffbeforeafterboth--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -47,6 +47,7 @@
nix-eval.workspace = true
nom = "7.1.3"
fleet-base = { version = "0.1.0", path = "../../crates/fleet-base" }
+indoc = "2.0.6"
[features]
default = ["indicatif"]
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, str::FromStr, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6 host::{Config, ConfigHost},7 opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 Self::Upload => None,39 Self::Test => Some("test"),40 Self::Boot => Some("boot"),41 Self::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test | Self::Boot)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75 let mut parts = g.split_whitespace();76 let id = parts.next()?;77 let id: u32 = id.parse().ok()?;78 let date = parts.next()?;79 let time = parts.next()?;80 let current = if let Some(current) = parts.next() {81 if current == "(current)" {82 Some(true)83 } else {84 None85 }86 } else {87 Some(false)88 };89 let current = current?;90 if parts.next().is_some() {91 warn!("unexpected text after generation: {g}");92 }93 Some(Generation {94 id,95 current,96 datetime: format!("{date} {time}"),97 })98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101 let mut cmd = host.cmd("nix-env").await?;102 cmd.comparg("--profile", "/nix/var/nix/profiles/system")103 .arg("--list-generations");104 // Sudo is required due to --list-generations acquiring lock on the profile.105 let data = cmd.sudo().run_string().await?;106 let generations = data107 .split('\n')108 .map(|e| e.trim())109 .filter(|&l| !l.is_empty())110 .filter_map(|g| {111 let gen = parse_generation_line(g);112 if gen.is_none() {113 warn!("bad generation: {g}");114 }115 gen116 })117 .collect::<Vec<_>>();118 let current = generations119 .into_iter()120 .filter(|g| g.current)121 .at_most_one()122 .map_err(|_e| anyhow!("bad list-generations output"))?123 .ok_or_else(|| anyhow!("failed to find generation"))?;124 Ok(current)125}126127async fn deploy_task(128 action: DeployAction,129 host: &ConfigHost,130 built: PathBuf,131 specialisation: Option<String>,132 disable_rollback: bool,133) -> Result<()> {134 let mut failed = false;135136 // TODO: Lockfile, to prevent concurrent system switch?137 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback138 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to139 // unit name conflict in systemd-run140 // This code is tied to rollback.nix141 if !disable_rollback && action.should_create_rollback_marker() {142 let _span = info_span!("preparing").entered();143 info!("preparing for rollback");144 let generation = get_current_generation(host).await?;145 info!(146 "rollback target would be {} {}",147 generation.id, generation.datetime148 );149 {150 let mut cmd = host.cmd("sh").await?;151 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));152 if let Err(e) = cmd.sudo().run().await {153 error!("failed to set rollback marker: {e}");154 failed = true;155 }156 }157 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.158 // Kicking it on manually will work best.159 //160 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will161 // only allow one instance of it.162163 // TODO: We should also watch how this process is going.164 // After running this command, we have less than 3 minutes to deploy everything,165 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.166 // Anyway, reboot will still help in this case.167 if action.should_schedule_rollback_run() {168 let mut cmd = host.cmd("systemd-run").await?;169 cmd.comparg("--on-active", "3min")170 .comparg("--unit", "rollback-watchdog-run")171 .arg("systemctl")172 .arg("start")173 .arg("rollback-watchdog.service");174 if let Err(e) = cmd.sudo().run().await {175 error!("failed to schedule rollback run: {e}");176 failed = true;177 }178 }179 }180181 if action.should_switch_profile() && !failed {182 info!("switching system profile generation");183 // It would also be possible to update profile atomically during copy:184 // https://github.com/NixOS/nix/pull/11657185 let mut cmd = host.cmd("nix").await?;186 cmd.arg("build");187 cmd.comparg("--profile", "/nix/var/nix/profiles/system");188 cmd.arg(&built);189 if let Err(e) = cmd.sudo().run_nix().await {190 error!("failed to switch system profile generation: {e}");191 failed = true;192 }193 }194195 // FIXME: Connection might be disconnected after activation run196197 if action.should_activate() && !failed {198 let _span = info_span!("activating").entered();199 info!("executing activation script");200 let specialised = if let Some(specialisation) = specialisation {201 let mut specialised = built.join("specialisation");202 specialised.push(specialisation);203 specialised204 } else {205 built.clone()206 };207 let switch_script = specialised.join("bin/switch-to-configuration");208 let mut cmd = host.cmd(switch_script).in_current_span().await?;209 cmd.arg(action.name().expect("upload.should_activate == false"));210 if let Err(e) = cmd.sudo().run().in_current_span().await {211 error!("failed to activate: {e}");212 failed = true;213 }214 }215 if action.should_create_rollback_marker() {216 if !disable_rollback {217 if failed {218 if action.should_schedule_rollback_run() {219 info!("executing rollback");220 if let Err(e) = host221 .systemctl_start("rollback-watchdog.service")222 .instrument(info_span!("rollback"))223 .await224 {225 error!("failed to trigger rollback: {e}")226 }227 }228 } else {229 info!("trying to mark upgrade as successful");230 if let Err(e) = host231 .rm_file("/etc/fleet_rollback_marker", true)232 .in_current_span()233 .await234 {235 error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")236 }237 }238 info!("disarming watchdog, just in case");239 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {240 // It is ok, if there was no reboot - then timer might not be running.241 }242 if action.should_schedule_rollback_run() {243 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {244 error!("failed to disarm rollback run: {e}");245 }246 }247 } else if let Err(_e) = host248 .rm_file("/etc/fleet_rollback_marker", true)249 .in_current_span()250 .await251 {252 // Marker might not exist, yet better try to remove it.253 }254 }255 Ok(())256}257258async fn build_task(259 config: Config,260 hostname: String,261 build_attr: &str,262 batch: Option<NixBuildBatch>,263) -> Result<PathBuf> {264 info!("building");265 let host = config.host(&hostname).await?;266 // let action = Action::from(self.subcommand.clone());267 let nixos = host.nixos_config().await?;268 let drv = nix_go!(nixos.system.build[{ build_attr }]);269 let outputs = drv.build_maybe_batch(batch).await?;270 let out_output = outputs271 .get("out")272 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;273274 {275 info!("adding gc root");276 let mut cmd = config.local_host().cmd("nix").await?;277 cmd.arg("build")278 .comparg(279 "--profile",280 format!(281 "/nix/var/nix/profiles/{}-{hostname}",282 config.data().gc_root_prefix283 ),284 )285 .arg(out_output);286 cmd.sudo().run_nix().await?;287 }288289 Ok(out_output.clone())290}291292impl BuildSystems {293 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {294 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;295 let set = LocalSet::new();296 let build_attr = self.build_attr.clone();297 let batch = (hosts.len() > 1).then(|| {298 config299 .nix_session300 .new_build_batch("build-hosts".to_string())301 });302 for host in hosts {303 let config = config.clone();304 let span = info_span!("build", host = field::display(&host.name));305 let hostname = host.name;306 let build_attr = build_attr.clone();307 let batch = batch.clone();308 set.spawn_local(309 (async move {310 let built = match build_task(config, hostname.clone(), &build_attr, batch).await311 {312 Ok(path) => path,313 Err(e) => {314 error!("failed to deploy host: {}", e);315 return;316 }317 };318 // TODO: Handle error319 let mut out = current_dir().expect("cwd exists");320 out.push(format!("built-{}", hostname));321322 info!("linking iso image to {:?}", out);323 if let Err(e) = symlink(built, out) {324 error!("failed to symlink: {e}")325 }326 })327 .instrument(span),328 );329 }330 drop(batch);331 set.await;332 Ok(())333 }334}335336#[derive(Clone, PartialEq, Copy)]337enum DeployKind {338 // NixOS => NixOS managed by fleet339 UpgradeToFleet,340 // NixOS managed by fleet => NixOS managed by fleet341 Fleet,342}343impl FromStr for DeployKind {344 type Err = anyhow::Error;345 fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {346 match s {347 "upgrade-to-fleet" => Ok(Self::UpgradeToFleet),348 "fleet" => Ok(Self::Fleet),349 v => bail!("unknown deploy_kind: {v}; expected on of \"upgrade-to-fleet\", \"fleet\""),350 }351 }352}353354impl Deploy {355 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {356 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;357 let set = LocalSet::new();358 let batch = (hosts.len() > 1).then(|| {359 config360 .nix_session361 .new_build_batch("deploy-hosts".to_string())362 });363 for host in hosts.into_iter() {364 let config = config.clone();365 let span = info_span!("deploy", host = field::display(&host.name));366 let hostname = host.name.clone();367 let local_host = config.local_host();368 let opts = opts.clone();369 let batch = batch.clone();370 let mut deploy_kind: Option<DeployKind> =371 opts.action_attr(&host, "deploy_kind").await?;372373 set.spawn_local(374 (async move {375 let built =376 match build_task(config.clone(), hostname.clone(), "toplevel", batch).await377 {378 Ok(path) => path,379 Err(e) => {380 error!("failed to build host system closure: {}", e);381 return;382 }383 };384 if deploy_kind == None {385 let is_fleet_managed = match host.file_exists("/etc/FLEET_HOST").await {386 Ok(v) => v,387 Err(e) => {388 error!("failed to query remote system kind: {}", e);389 return;390 },391 };392 if !is_fleet_managed {393 error!(indoc::indoc!{"394 host is not marked as managed by fleet395 if you're not trying to lustrate/install system from scratch,396 you should either397 1. manually create /etc/FLEET_HOST file on the target host,398 2. use ?deploy_kind=fleet host argument if you're upgrading from older version of fleet399 3. use ?deploy_kind=upgrade_to_fleet if you're upgrading from plain nixos to fleet-managed nixos400 "});401 return;402 }403 deploy_kind = Some(DeployKind::Fleet);404 }405 let deploy_kind = deploy_kind.expect("deploy_kind is set");406407 // TODO: Make disable_rollback a host attribute instead408 let mut disable_rollback = self.disable_rollback;409 if !disable_rollback && deploy_kind != DeployKind::Fleet {410 warn!("disabling rollback, as not supported by non-fleet deployment kinds");411 disable_rollback = true;412 }413414 if !opts.is_local(&hostname) {415 info!("uploading system closure");416 {417 // TODO: Move to remote_derivation method.418 // Alternatively, nix store make-content-addressed can be used,419 // at least for the first deployment, to provide trusted store key.420 //421 // It is much slower, yet doesn't require root on the deployer machine.422 let Ok(mut sign) = local_host.cmd("nix").await else {423 error!("failed to setup local");424 return;425 };426 // Private key for host machine is registered in nix-sign.nix427 sign.arg("store")428 .arg("sign")429 .comparg("--key-file", "/etc/nix/private-key")430 .arg("-r")431 .arg(&built);432 if let Err(e) = sign.sudo().run_nix().await {433 warn!("failed to sign store paths: {e}");434 };435 }436 let mut tries = 0;437 loop {438 match host.remote_derivation(&built).await {439 Ok(remote) => {440 assert!(remote == built, "CA derivations aren't implemented");441 break;442 }443 Err(e) if tries < 3 => {444 tries += 1;445 warn!("copy failure ({}/3): {}", tries, e);446 sleep(Duration::from_millis(5000)).await;447 }448 Err(e) => {449 error!("upload failed: {e}");450 return;451 }452 }453 }454 }455 if let Err(e) = deploy_task(456 self.action,457 &host,458 built,459 if let Ok(v) = opts.action_attr(&host, "specialisation").await {460 v461 } else {462 error!("unreachable? failed to get specialization");463 return;464 },465 disable_rollback,466 )467 .await468 {469 error!("activation failed: {e}");470 }471 })472 .instrument(span),473 );474 }475 drop(batch);476 set.await;477 Ok(())478 }479}cmds/fleet/src/main.rsdiffbeforeafterboth--- a/cmds/fleet/src/main.rs
+++ b/cmds/fleet/src/main.rs
@@ -66,9 +66,9 @@
#[derive(Parser)]
enum Opts {
- /// Prepare systems for deployments
+ /// Build system closures
BuildSystems(BuildSystems),
-
+ /// Upload and switch system closures
Deploy(Deploy),
/// Secret management
#[clap(subcommand)]
crates/fleet-base/src/command.rsdiffbeforeafterboth--- a/crates/fleet-base/src/command.rs
+++ b/crates/fleet-base/src/command.rs
@@ -5,6 +5,7 @@
use futures::StreamExt;
use itertools::Either;
use openssh::{OverSsh, OwningCommand, Session};
+use serde::de::DeserializeOwned;
use tokio::{io::AsyncRead, process::Command, select};
use tokio_util::codec::{BytesCodec, FramedRead, LinesCodec};
use tracing::debug;
@@ -230,6 +231,10 @@
let bytes = self.run_bytes().await?;
Ok(String::from_utf8(bytes)?)
}
+ pub async fn run_value<T: DeserializeOwned>(self) -> Result<T> {
+ let v = self.run_string().await?;
+ Ok(serde_json::from_str(&v)?)
+ }
pub async fn run_bytes(self) -> Result<Vec<u8>> {
let str = self.clone().into_string();
let cmd = self.wrap_sudo_if_needed().into_command()?;
crates/fleet-base/src/host.rsdiffbeforeafterboth--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -105,6 +105,14 @@
let path = cmd.run_string().await?;
Ok(path.trim_end().to_owned())
}
+ pub async fn file_exists(&self, path: impl AsRef<OsStr>) -> Result<bool> {
+ let mut cmd = self.cmd("sh").await?;
+ cmd.arg("-c")
+ .arg("test -e \"$1\" && echo true || echo false")
+ .arg("_")
+ .arg(path);
+ Ok(cmd.run_value().await?)
+ }
pub async fn read_file_bin(&self, path: impl AsRef<OsStr>) -> Result<Vec<u8>> {
let mut cmd = self.cmd("cat").await?;
cmd.arg(path);
modules/nixos/meta.nixdiffbeforeafterboth--- a/modules/nixos/meta.nix
+++ b/modules/nixos/meta.nix
@@ -1,8 +1,17 @@
-{lib, ...}: let
+{ lib, ... }:
+let
inherit (lib.modules) mkRemovedOptionModule;
-in {
+in
+{
imports = [
- (mkRemovedOptionModule ["tags"] "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts.")
- (mkRemovedOptionModule ["network"] "network is now defined at the host level, not the nixos system level")
+ (mkRemovedOptionModule [ "tags" ]
+ "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts."
+ )
+ (mkRemovedOptionModule [
+ "network"
+ ] "network is now defined at the host level, not the nixos system level")
];
+
+ # Version of environment (fleet scripts such as rollback) already installed on the host
+ config.environment.etc.FLEET_HOST.text = "1";
}