difftreelog
feat explicitly mark hosts as managed by fleet
in: trunk
7 files changed
Cargo.lockdiffbeforeafterboth--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,6 +924,7 @@
"hostname",
"human-repr",
"indicatif",
+ "indoc",
"itertools 0.13.0",
"nix-eval",
"nixlike",
@@ -1537,6 +1538,12 @@
]
[[package]]
+name = "indoc"
+version = "2.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
+
+[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
cmds/fleet/Cargo.tomldiffbeforeafterboth--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -47,6 +47,7 @@
nix-eval.workspace = true
nom = "7.1.3"
fleet-base = { version = "0.1.0", path = "../../crates/fleet-base" }
+indoc = "2.0.6"
[features]
default = ["indicatif"]
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6 host::{Config, ConfigHost},7 opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 Self::Upload => None,39 Self::Test => Some("test"),40 Self::Boot => Some("boot"),41 Self::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test | Self::Boot)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75 let mut parts = g.split_whitespace();76 let id = parts.next()?;77 let id: u32 = id.parse().ok()?;78 let date = parts.next()?;79 let time = parts.next()?;80 let current = if let Some(current) = parts.next() {81 if current == "(current)" {82 Some(true)83 } else {84 None85 }86 } else {87 Some(false)88 };89 let current = current?;90 if parts.next().is_some() {91 warn!("unexpected text after generation: {g}");92 }93 Some(Generation {94 id,95 current,96 datetime: format!("{date} {time}"),97 })98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101 let mut cmd = host.cmd("nix-env").await?;102 cmd.comparg("--profile", "/nix/var/nix/profiles/system")103 .arg("--list-generations");104 // Sudo is required due to --list-generations acquiring lock on the profile.105 let data = cmd.sudo().run_string().await?;106 let generations = data107 .split('\n')108 .map(|e| e.trim())109 .filter(|&l| !l.is_empty())110 .filter_map(|g| {111 let gen = parse_generation_line(g);112 if gen.is_none() {113 warn!("bad generation: {g}");114 }115 gen116 })117 .collect::<Vec<_>>();118 let current = generations119 .into_iter()120 .filter(|g| g.current)121 .at_most_one()122 .map_err(|_e| anyhow!("bad list-generations output"))?123 .ok_or_else(|| anyhow!("failed to find generation"))?;124 Ok(current)125}126127async fn deploy_task(128 action: DeployAction,129 host: &ConfigHost,130 built: PathBuf,131 specialisation: Option<String>,132 disable_rollback: bool,133) -> Result<()> {134 let mut failed = false;135 // TODO: Lockfile, to prevent concurrent system switch?136 // TODO: If rollback target exists - bail, it 
should be removed. Lockfile will not work in case if rollback137 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to138 // unit name conflict in systemd-run139 // This code is tied to rollback.nix140 if !disable_rollback && action.should_create_rollback_marker() {141 let _span = info_span!("preparing").entered();142 info!("preparing for rollback");143 let generation = get_current_generation(host).await?;144 info!(145 "rollback target would be {} {}",146 generation.id, generation.datetime147 );148 {149 let mut cmd = host.cmd("sh").await?;150 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));151 if let Err(e) = cmd.sudo().run().await {152 error!("failed to set rollback marker: {e}");153 failed = true;154 }155 }156 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.157 // Kicking it on manually will work best.158 //159 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will160 // only allow one instance of it.161162 // TODO: We should also watch how this process is going.163 // After running this command, we have less than 3 minutes to deploy everything,164 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.165 // Anyway, reboot will still help in this case.166 if action.should_schedule_rollback_run() {167 let mut cmd = host.cmd("systemd-run").await?;168 cmd.comparg("--on-active", "3min")169 .comparg("--unit", "rollback-watchdog-run")170 .arg("systemctl")171 .arg("start")172 .arg("rollback-watchdog.service");173 if let Err(e) = cmd.sudo().run().await {174 error!("failed to schedule rollback run: {e}");175 failed = true;176 }177 }178 }179180 if action.should_switch_profile() && !failed {181 info!("switching system profile 
generation");182 // It would also be possible to update profile atomically during copy:183 // https://github.com/NixOS/nix/pull/11657184 let mut cmd = host.cmd("nix").await?;185 cmd.arg("build");186 cmd.comparg("--profile", "/nix/var/nix/profiles/system");187 cmd.arg(&built);188 if let Err(e) = cmd.sudo().run_nix().await {189 error!("failed to switch system profile generation: {e}");190 failed = true;191 }192 }193194 // FIXME: Connection might be disconnected after activation run195196 if action.should_activate() && !failed {197 let _span = info_span!("activating").entered();198 info!("executing activation script");199 let specialised = if let Some(specialisation) = specialisation {200 let mut specialised = built.join("specialisation");201 specialised.push(specialisation);202 specialised203 } else {204 built.clone()205 };206 let switch_script = specialised.join("bin/switch-to-configuration");207 let mut cmd = host.cmd(switch_script).in_current_span().await?;208 cmd.arg(action.name().expect("upload.should_activate == false"));209 if let Err(e) = cmd.sudo().run().in_current_span().await {210 error!("failed to activate: {e}");211 failed = true;212 }213 }214 if action.should_create_rollback_marker() {215 if !disable_rollback {216 if failed {217 if action.should_schedule_rollback_run() {218 info!("executing rollback");219 if let Err(e) = host220 .systemctl_start("rollback-watchdog.service")221 .instrument(info_span!("rollback"))222 .await223 {224 error!("failed to trigger rollback: {e}")225 }226 }227 } else {228 info!("trying to mark upgrade as successful");229 if let Err(e) = host230 .rm_file("/etc/fleet_rollback_marker", true)231 .in_current_span()232 .await233 {234 error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")235 }236 }237 info!("disarming watchdog, just in case");238 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {239 // It is ok, if there was no reboot - then timer might not be running.240 }241 if action.should_schedule_rollback_run() {242 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {243 error!("failed to disarm rollback run: {e}");244 }245 }246 } else if let Err(_e) = host247 .rm_file("/etc/fleet_rollback_marker", true)248 .in_current_span()249 .await250 {251 // Marker might not exist, yet better try to remove it.252 }253 }254 Ok(())255}256257async fn build_task(258 config: Config,259 hostname: String,260 build_attr: &str,261 batch: Option<NixBuildBatch>,262) -> Result<PathBuf> {263 info!("building");264 let host = config.host(&hostname).await?;265 // let action = Action::from(self.subcommand.clone());266 let nixos = host.nixos_config().await?;267 let drv = nix_go!(nixos.system.build[{ build_attr }]);268 let outputs = drv.build_maybe_batch(batch).await?;269 let out_output = outputs270 .get("out")271 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;272273 {274 info!("adding gc root");275 let mut cmd = config.local_host().cmd("nix").await?;276 cmd.arg("build")277 .comparg(278 "--profile",279 format!(280 "/nix/var/nix/profiles/{}-{hostname}",281 config.data().gc_root_prefix282 ),283 )284 .arg(out_output);285 cmd.sudo().run_nix().await?;286 }287288 Ok(out_output.clone())289}290291impl BuildSystems {292 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {293 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;294 let set = LocalSet::new();295 let build_attr = self.build_attr.clone();296 let batch = (hosts.len() > 1).then(|| {297 config298 .nix_session299 .new_build_batch("build-hosts".to_string())300 });301 for host in hosts {302 let config = config.clone();303 let span = info_span!("build", host = 
field::display(&host.name));304 let hostname = host.name;305 let build_attr = build_attr.clone();306 let batch = batch.clone();307 set.spawn_local(308 (async move {309 let built = match build_task(config, hostname.clone(), &build_attr, batch).await310 {311 Ok(path) => path,312 Err(e) => {313 error!("failed to deploy host: {}", e);314 return;315 }316 };317 // TODO: Handle error318 let mut out = current_dir().expect("cwd exists");319 out.push(format!("built-{}", hostname));320321 info!("linking iso image to {:?}", out);322 if let Err(e) = symlink(built, out) {323 error!("failed to symlink: {e}")324 }325 })326 .instrument(span),327 );328 }329 drop(batch);330 set.await;331 Ok(())332 }333}334335impl Deploy {336 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {337 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;338 let set = LocalSet::new();339 let batch = (hosts.len() > 1).then(|| {340 config341 .nix_session342 .new_build_batch("deploy-hosts".to_string())343 });344 for host in hosts.into_iter() {345 let config = config.clone();346 let span = info_span!("deploy", host = field::display(&host.name));347 let hostname = host.name.clone();348 let local_host = config.local_host();349 let opts = opts.clone();350 let batch = batch.clone();351352 set.spawn_local(353 (async move {354 let built =355 match build_task(config.clone(), hostname.clone(), "toplevel", batch).await356 {357 Ok(path) => path,358 Err(e) => {359 error!("failed to deploy host: {}", e);360 return;361 }362 };363 if !opts.is_local(&hostname) {364 info!("uploading system closure");365 {366 // TODO: Move to remote_derivation method.367 // Alternatively, nix store make-content-addressed can be used,368 // at least for the first deployment, to provide trusted store key.369 //370 // It is much slower, yet doesn't require root on the deployer machine.371 let Ok(mut sign) = local_host.cmd("nix").await else {372 error!("failed to setup local");373 return;374 };375 // Private key 
for host machine is registered in nix-sign.nix376 sign.arg("store")377 .arg("sign")378 .comparg("--key-file", "/etc/nix/private-key")379 .arg("-r")380 .arg(&built);381 if let Err(e) = sign.sudo().run_nix().await {382 warn!("failed to sign store paths: {e}");383 };384 }385 let mut tries = 0;386 loop {387 match host.remote_derivation(&built).await {388 Ok(remote) => {389 assert!(remote == built, "CA derivations aren't implemented");390 break;391 }392 Err(e) if tries < 3 => {393 tries += 1;394 warn!("copy failure ({}/3): {}", tries, e);395 sleep(Duration::from_millis(5000)).await;396 }397 Err(e) => {398 error!("upload failed: {e}");399 return;400 }401 }402 }403 }404 if let Err(e) = deploy_task(405 self.action,406 &host,407 built,408 if let Ok(v) = opts.action_attr(&host, "specialisation").await {409 v410 } else {411 error!("unreachable? failed to get specialization");412 return;413 },414 self.disable_rollback,415 )416 .await417 {418 error!("activation failed: {e}");419 }420 })421 .instrument(span),422 );423 }424 drop(batch);425 set.await;426 Ok(())427 }428}cmds/fleet/src/main.rsdiffbeforeafterboth--- a/cmds/fleet/src/main.rs
+++ b/cmds/fleet/src/main.rs
@@ -66,9 +66,9 @@
#[derive(Parser)]
enum Opts {
- /// Prepare systems for deployments
+ /// Build system closures
BuildSystems(BuildSystems),
-
+ /// Upload and switch system closures
Deploy(Deploy),
/// Secret management
#[clap(subcommand)]
crates/fleet-base/src/command.rsdiffbeforeafterboth--- a/crates/fleet-base/src/command.rs
+++ b/crates/fleet-base/src/command.rs
@@ -5,6 +5,7 @@
use futures::StreamExt;
use itertools::Either;
use openssh::{OverSsh, OwningCommand, Session};
+use serde::de::DeserializeOwned;
use tokio::{io::AsyncRead, process::Command, select};
use tokio_util::codec::{BytesCodec, FramedRead, LinesCodec};
use tracing::debug;
@@ -230,6 +231,10 @@
let bytes = self.run_bytes().await?;
Ok(String::from_utf8(bytes)?)
}
+ /// Runs the command and deserializes its output as JSON.
+ ///
+ /// The output is collected through [`Self::run_string`] and then handed to
+ /// `serde_json::from_str`, so `T` can be any owned deserializable type.
+ ///
+ /// # Errors
+ ///
+ /// Propagates any error from `run_string`, and fails when the output
+ /// cannot be deserialized into `T`.
+ pub async fn run_value<T: DeserializeOwned>(self) -> Result<T> {
+     let output = self.run_string().await?;
+     let value = serde_json::from_str(&output)?;
+     Ok(value)
+ }
pub async fn run_bytes(self) -> Result<Vec<u8>> {
let str = self.clone().into_string();
let cmd = self.wrap_sudo_if_needed().into_command()?;
crates/fleet-base/src/host.rsdiffbeforeafterboth--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -105,6 +105,14 @@
let path = cmd.run_string().await?;
Ok(path.trim_end().to_owned())
}
+ /// Checks whether `path` exists, using a shell obtained from `self.cmd("sh")`.
+ ///
+ /// The path is handed over as the positional parameter `$1` (with `_`
+ /// filling `$0`) instead of being spliced into the script text. The script
+ /// prints `true` or `false`, which `run_value` then parses into a `bool`.
+ ///
+ /// NOTE(review): `test -e` resolves symlinks, so a dangling symlink
+ /// presumably reports `false` — confirm that is what callers expect.
+ ///
+ /// # Errors
+ ///
+ /// Fails if the command cannot be created or run, or if its output
+ /// cannot be parsed.
+ pub async fn file_exists(&self, path: impl AsRef<OsStr>) -> Result<bool> {
+     let mut check = self.cmd("sh").await?;
+     check
+         .arg("-c")
+         .arg("test -e \"$1\" && echo true || echo false")
+         .arg("_")
+         .arg(path);
+     let exists: bool = check.run_value().await?;
+     Ok(exists)
+ }
pub async fn read_file_bin(&self, path: impl AsRef<OsStr>) -> Result<Vec<u8>> {
let mut cmd = self.cmd("cat").await?;
cmd.arg(path);
modules/nixos/meta.nixdiffbeforeafterboth--- a/modules/nixos/meta.nix
+++ b/modules/nixos/meta.nix
@@ -1,8 +1,17 @@
-{lib, ...}: let
+{ lib, ... }:
+let
inherit (lib.modules) mkRemovedOptionModule;
-in {
+in
+{
imports = [
- (mkRemovedOptionModule ["tags"] "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts.")
- (mkRemovedOptionModule ["network"] "network is now defined at the host level, not the nixos system level")
+ (mkRemovedOptionModule [ "tags" ]
+ "tags are now defined at the host level, not the nixos system level for fast filtering without evaluating unnecessary hosts."
+ )
+ (mkRemovedOptionModule [
+ "network"
+ ] "network is now defined at the host level, not the nixos system level")
];
+
+ # Version of the fleet environment (fleet scripts such as rollback) already installed on the host.
+ # The resulting /etc/FLEET_HOST file also marks the host as managed by fleet.
+ config.environment.etc.FLEET_HOST.text = "1";
}