difftreelog
feat nixos-install target
in: trunk
6 files changed
Cargo.lockdiffbeforeafterboth--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,7 +924,6 @@
"hostname",
"human-repr",
"indicatif",
- "indoc",
"itertools 0.13.0",
"nix-eval",
"nixlike",
@@ -958,6 +957,7 @@
"fleet-shared",
"futures",
"hostname",
+ "indoc",
"itertools 0.13.0",
"nix-eval",
"nixlike",
cmds/fleet/Cargo.tomldiffbeforeafterboth--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -47,7 +47,6 @@
nix-eval.workspace = true
nom = "7.1.3"
fleet-base = { version = "0.1.0", path = "../../crates/fleet-base" }
-indoc = "2.0.6"
[features]
default = ["indicatif"]
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6 host::{Config, ConfigHost, DeployKind},7 opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 Self::Upload => None,39 Self::Test => Some("test"),40 Self::Boot => Some("boot"),41 Self::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test | Self::Boot)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75 let mut parts = g.split_whitespace();76 let id = parts.next()?;77 let id: u32 = id.parse().ok()?;78 let date = parts.next()?;79 let time = parts.next()?;80 let current = if let Some(current) = parts.next() {81 if current == "(current)" {82 Some(true)83 } else {84 None85 }86 } else {87 Some(false)88 };89 let current = current?;90 if parts.next().is_some() {91 warn!("unexpected text after generation: {g}");92 }93 Some(Generation {94 id,95 current,96 datetime: format!("{date} {time}"),97 })98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101 let mut cmd = host.cmd("nix-env").await?;102 cmd.comparg("--profile", "/nix/var/nix/profiles/system")103 .arg("--list-generations");104 // Sudo is required due to --list-generations acquiring lock on the profile.105 let data = cmd.sudo().run_string().await?;106 let generations = data107 .split('\n')108 .map(|e| e.trim())109 .filter(|&l| !l.is_empty())110 .filter_map(|g| {111 let gen = parse_generation_line(g);112 if gen.is_none() {113 warn!("bad generation: {g}");114 }115 gen116 })117 .collect::<Vec<_>>();118 let current = generations119 .into_iter()120 .filter(|g| g.current)121 .at_most_one()122 .map_err(|_e| anyhow!("bad list-generations output"))?123 .ok_or_else(|| anyhow!("failed to find generation"))?;124 Ok(current)125}126127async fn deploy_task(128 action: DeployAction,129 host: &ConfigHost,130 built: PathBuf,131 specialisation: Option<String>,132 disable_rollback: bool,133) -> Result<()> {134 let deploy_kind = host.deploy_kind().await?;135 if deploy_kind == DeployKind::NixosInstall136 && !matches!(action, DeployAction::Boot | 
DeployAction::Upload)137 {138 bail!("nixos-install deploy kind only supports boot and upload actions");139 }140141 let mut failed = false;142143 // TODO: Lockfile, to prevent concurrent system switch?144 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146 // unit name conflict in systemd-run147 // This code is tied to rollback.nix148 if !disable_rollback && action.should_create_rollback_marker() {149 let _span = info_span!("preparing").entered();150 info!("preparing for rollback");151 let generation = get_current_generation(host).await?;152 info!(153 "rollback target would be {} {}",154 generation.id, generation.datetime155 );156 {157 let mut cmd = host.cmd("sh").await?;158 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159 if let Err(e) = cmd.sudo().run().await {160 error!("failed to set rollback marker: {e}");161 failed = true;162 }163 }164 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165 // Kicking it on manually will work best.166 //167 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168 // only allow one instance of it.169170 // TODO: We should also watch how this process is going.171 // After running this command, we have less than 3 minutes to deploy everything,172 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.173 // Anyway, reboot will still help in this case.174 if action.should_schedule_rollback_run() {175 let mut cmd = host.cmd("systemd-run").await?;176 cmd.comparg("--on-active", "3min")177 .comparg("--unit", "rollback-watchdog-run")178 .arg("systemctl")179 .arg("start")180 
.arg("rollback-watchdog.service");181 if let Err(e) = cmd.sudo().run().await {182 error!("failed to schedule rollback run: {e}");183 failed = true;184 }185 }186 }187 if deploy_kind == DeployKind::NixosInstall {188 info!(189 "running nixos-install to switch profile, install bootloader, and perform activation"190 );191 let mut cmd = host.cmd("nixos-install").await?;192 cmd.arg("--system").arg(&built).args([193 // Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.194 // It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.195 "--no-channel-copy",196 "--root",197 "/mnt",198 ]);199 if let Err(e) = cmd.sudo().run().await {200 error!("failed to execute nixos-install: {e}");201 failed = true;202 }203 } else {204 if action.should_switch_profile() && !failed {205 info!("switching system profile generation");206207 // To avoid even more problems, using nixos-install for now.208 // // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,209 // // falling back to using nix-env command210 // // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.211 // let mut cmd = host.cmd("nix-env").await?;212 // cmd.args([213 // "--store",214 // "/mnt",215 // "--profile",216 // "/mnt/nix/var/nix/profiles/system",217 // "--set",218 // ])219 // .arg(&built);220 // if let Err(e) = cmd.sudo().run_nix().await {221 // error!("failed to switch system profile generation: {e}");222 // failed = true;223 // }224 // It would also be possible to update profile atomically during copy:225 // https://github.com/NixOS/nix/pull/11657226 let mut cmd = host.nix_cmd().await?;227 cmd.arg("build");228 cmd.comparg("--profile", "/nix/var/nix/profiles/system");229 cmd.arg(&built);230 if let Err(e) = cmd.sudo().run_nix().await {231 error!("failed to switch system 
profile generation: {e}");232 failed = true;233 }234 }235236 // FIXME: Connection might be disconnected after activation run237238 if action.should_activate() && !failed {239 let _span = info_span!("activating").entered();240 info!("executing activation script");241 let specialised = if let Some(specialisation) = specialisation {242 let mut specialised = built.join("specialisation");243 specialised.push(specialisation);244 specialised245 } else {246 built.clone()247 };248 let switch_script = specialised.join("bin/switch-to-configuration");249 let mut cmd = host.cmd(switch_script).in_current_span().await?;250 cmd.arg(action.name().expect("upload.should_activate == false"));251 if let Err(e) = cmd.sudo().run().in_current_span().await {252 error!("failed to activate: {e}");253 failed = true;254 }255 }256 }257 if action.should_create_rollback_marker() {258 if !disable_rollback {259 if failed {260 if action.should_schedule_rollback_run() {261 info!("executing rollback");262 if let Err(e) = host263 .systemctl_start("rollback-watchdog.service")264 .instrument(info_span!("rollback"))265 .await266 {267 error!("failed to trigger rollback: {e}")268 }269 }270 } else {271 info!("trying to mark upgrade as successful");272 if let Err(e) = host273 .rm_file("/etc/fleet_rollback_marker", true)274 .in_current_span()275 .await276 {277 error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")278 }279 }280 info!("disarming watchdog, just in case");281 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {282 // It is ok, if there was no reboot - then timer might not be running.283 }284 if action.should_schedule_rollback_run() {285 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {286 error!("failed to disarm rollback run: {e}");287 }288 }289 } else if let Err(_e) = host290 .rm_file("/etc/fleet_rollback_marker", true)291 .in_current_span()292 .await293 {294 // Marker might not exist, yet better try to remove it.295 }296 }297 Ok(())298}299300async fn build_task(301 config: Config,302 hostname: String,303 build_attr: &str,304 batch: Option<NixBuildBatch>,305) -> Result<PathBuf> {306 info!("building");307 let host = config.host(&hostname).await?;308 // let action = Action::from(self.subcommand.clone());309 let nixos = host.nixos_config().await?;310 let drv = nix_go!(nixos.system.build[{ build_attr }]);311 let outputs = drv.build_maybe_batch(batch).await?;312 let out_output = outputs313 .get("out")314 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;315316 {317 info!("adding gc root");318 let mut cmd = config.local_host().cmd("nix").await?;319 cmd.arg("build")320 .comparg(321 "--profile",322 format!(323 "/nix/var/nix/profiles/{}-{hostname}",324 config.data().gc_root_prefix325 ),326 )327 .arg(out_output);328 cmd.sudo().run_nix().await?;329 }330331 Ok(out_output.clone())332}333334impl BuildSystems {335 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {336 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;337 let set = LocalSet::new();338 let build_attr = self.build_attr.clone();339 let batch = (hosts.len() > 1).then(|| {340 config341 .nix_session342 .new_build_batch("build-hosts".to_string())343 });344 for host in hosts {345 let config = config.clone();346 let span = info_span!("build", host = 
field::display(&host.name));347 let hostname = host.name;348 let build_attr = build_attr.clone();349 let batch = batch.clone();350 set.spawn_local(351 (async move {352 let built = match build_task(config, hostname.clone(), &build_attr, batch).await353 {354 Ok(path) => path,355 Err(e) => {356 error!("failed to deploy host: {}", e);357 return;358 }359 };360 // TODO: Handle error361 let mut out = current_dir().expect("cwd exists");362 out.push(format!("built-{}", hostname));363364 info!("linking iso image to {:?}", out);365 if let Err(e) = symlink(built, out) {366 error!("failed to symlink: {e}")367 }368 })369 .instrument(span),370 );371 }372 drop(batch);373 set.await;374 Ok(())375 }376}377378impl Deploy {379 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {380 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;381 let set = LocalSet::new();382 let batch = (hosts.len() > 1).then(|| {383 config384 .nix_session385 .new_build_batch("deploy-hosts".to_string())386 });387 for host in hosts.into_iter() {388 let config = config.clone();389 let span = info_span!("deploy", host = field::display(&host.name));390 let hostname = host.name.clone();391 let local_host = config.local_host();392 let opts = opts.clone();393 let batch = batch.clone();394 if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, "deploy_kind").await? 
{395 host.set_deploy_kind(deploy_kind);396 };397398 set.spawn_local(399 (async move {400 let built =401 match build_task(config.clone(), hostname.clone(), "toplevel", batch).await402 {403 Ok(path) => path,404 Err(e) => {405 error!("failed to build host system closure: {}", e);406 return;407 }408 };409410 let deploy_kind = match host.deploy_kind().await {411 Ok(v) => v,412 Err(e) => {413 error!("failed to query target deploy kind: {e}");414 return;415 }416 };417418 // TODO: Make disable_rollback a host attribute instead419 let mut disable_rollback = self.disable_rollback;420 if !disable_rollback && deploy_kind != DeployKind::Fleet {421 warn!("disabling rollback, as not supported by non-fleet deployment kinds");422 disable_rollback = true;423 }424425 if !opts.is_local(&hostname) {426 info!("uploading system closure");427 {428 // TODO: Move to remote_derivation method.429 // Alternatively, nix store make-content-addressed can be used,430 // at least for the first deployment, to provide trusted store key.431 //432 // It is much slower, yet doesn't require root on the deployer machine.433 let Ok(mut sign) = local_host.cmd("nix").await else {434 error!("failed to setup local");435 return;436 };437 // Private key for host machine is registered in nix-sign.nix438 sign.arg("store")439 .arg("sign")440 .comparg("--key-file", "/etc/nix/private-key")441 .arg("-r")442 .arg(&built);443 if let Err(e) = sign.sudo().run_nix().await {444 warn!("failed to sign store paths: {e}");445 };446 }447 let mut tries = 0;448 loop {449 match host.remote_derivation(&built).await {450 Ok(remote) => {451 assert!(remote == built, "CA derivations aren't implemented");452 break;453 }454 Err(e) if tries < 3 => {455 tries += 1;456 warn!("copy failure ({}/3): {}", tries, e);457 sleep(Duration::from_millis(5000)).await;458 }459 Err(e) => {460 error!("upload failed: {e}");461 return;462 }463 }464 }465 }466 if let Err(e) = deploy_task(467 self.action,468 &host,469 built,470 if let Ok(v) = 
opts.action_attr(&host, "specialisation").await {471 v472 } else {473 error!("unreachable? failed to get specialization");474 return;475 },476 disable_rollback,477 )478 .await479 {480 error!("activation failed: {e}");481 }482 })483 .instrument(span),484 );485 }486 drop(batch);487 set.await;488 Ok(())489 }490}crates/fleet-base/Cargo.tomldiffbeforeafterboth--- a/crates/fleet-base/Cargo.toml
+++ b/crates/fleet-base/Cargo.toml
@@ -13,6 +13,7 @@
fleet-shared.workspace = true
futures = "0.3.30"
hostname = "0.4.0"
+indoc = "2.0.6"
itertools = "0.13.0"
nix-eval.workspace = true
nixlike.workspace = true
crates/fleet-base/src/host.rsdiffbeforeafterboth--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -58,11 +58,35 @@
Su,
}
+/// Kind of deployment performed against a host.
+///
+/// The textual forms accepted for the `deploy_kind` host argument are
+/// `upgrade-to-fleet`, `fleet` and `nixos-install`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DeployKind {
+    /// Plain NixOS => NixOS managed by fleet.
+    UpgradeToFleet,
+    /// NixOS managed by fleet => NixOS managed by fleet.
+    Fleet,
+    /// Installation through `nixos-install`: the remote host has `/mnt` and
+    /// `/mnt/boot` mounted, and the generated config is added to fleet configuration.
+    NixosInstall,
+}
+
+impl FromStr for DeployKind {
+    type Err = anyhow::Error;
+
+    /// Parses a deploy kind from its textual form: `upgrade-to-fleet`, `fleet`
+    /// or `nixos-install`.
+    ///
+    /// # Errors
+    /// Any other input is rejected with an error listing the accepted values.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s {
+            "upgrade-to-fleet" => Ok(Self::UpgradeToFleet),
+            "fleet" => Ok(Self::Fleet),
+            "nixos-install" => Ok(Self::NixosInstall),
+            v => bail!("unknown deploy_kind: {v}; expected one of \"upgrade-to-fleet\", \"fleet\", \"nixos-install\""),
+        }
+    }
+}
pub struct ConfigHost {
config: Config,
pub name: String,
groups: OnceCell<Vec<String>>,
+ deploy_kind: OnceCell<DeployKind>,
+
pub host_config: Option<Value>,
pub nixos_config: OnceCell<Value>,
pub pkgs_override: Option<Value>,
@@ -73,6 +97,40 @@
}
// TODO: Move command helpers away with connectivity refactor
impl ConfigHost {
+    /// Stores an explicit deploy kind for this host, so that `deploy_kind`
+    /// returns it instead of probing the remote system.
+    ///
+    /// # Panics
+    /// Panics if a deploy kind has already been stored for this host.
+    pub fn set_deploy_kind(&self, kind: DeployKind) {
+        let already_set = self.deploy_kind.set(kind).is_err();
+        if already_set {
+            panic!("deploy kind is already set");
+        }
+    }
+    /// Returns the deploy kind of this host, detecting it on first use.
+    ///
+    /// A kind that is already stored (explicitly, or by a previous call) is returned
+    /// as-is. Otherwise the host is probed for `/etc/FLEET_HOST`; when the file
+    /// exists, [`DeployKind::Fleet`] is stored and returned.
+    ///
+    /// # Errors
+    /// Fails when the probe itself fails, or when the marker file is missing, in
+    /// which case the error explains how to mark the host or pick a kind manually.
+    pub async fn deploy_kind(&self) -> Result<DeployKind> {
+        if let Some(kind) = self.deploy_kind.get() {
+            // `DeployKind` is `Copy`, a dereference is enough.
+            return Ok(*kind);
+        }
+        let is_fleet_managed = match self.file_exists("/etc/FLEET_HOST").await {
+            Ok(v) => v,
+            Err(e) => {
+                bail!("failed to query remote system kind: {}", e);
+            }
+        };
+        if !is_fleet_managed {
+            // The suggested values must match what `DeployKind::from_str` accepts.
+            bail!(indoc::indoc! {"
+                host is not marked as managed by fleet
+                if you're not trying to lustrate/install system from scratch,
+                you should either
+                1. manually create /etc/FLEET_HOST file on the target host,
+                2. use ?deploy_kind=fleet host argument if you're upgrading from older version of fleet
+                3. use ?deploy_kind=upgrade-to-fleet if you're upgrading from plain nixos to fleet-managed nixos
+            "});
+        }
+        // TOCTOU is possible. The result of `set` is ignored, so if a value is
+        // already stored at this point, that stored value is the one returned.
+        let _ = self.deploy_kind.set(DeployKind::Fleet);
+        Ok(*self
+            .deploy_kind
+            .get()
+            .expect("deploy kind is just set"))
+    }
pub async fn escalation_strategy(&self) -> Result<EscalationStrategy> {
// Prefer sudo, as run0 has some gotchas with polkit
// and too many repeating prompts.
@@ -189,6 +247,16 @@
Ok(MyCommand::new_on(escalation, cmd, session))
}
}
+    /// Creates a `nix` command for this host with the `nix-command` and `flakes`
+    /// experimental features enabled through `--extra-experimental-features`.
+    pub async fn nix_cmd(&self) -> Result<MyCommand> {
+        let mut command = self.cmd("nix").await?;
+        // One flag/value pair per feature, in the same order as before.
+        for feature in ["nix-command", "flakes"] {
+            command.args(["--extra-experimental-features", feature]);
+        }
+        Ok(command)
+    }
pub async fn decrypt(&self, data: SecretData) -> Result<Vec<u8>> {
ensure!(data.encrypted, "secret is not encrypted");
@@ -231,10 +299,23 @@
EscalationStrategy::Su,
"nix",
);
- nix.arg("copy")
- .arg("--substitute-on-destination")
- .comparg("--to", format!("ssh-ng://{}", self.name))
- .arg(path);
+ nix.arg("copy").arg("--substitute-on-destination");
+
+ match self.deploy_kind().await? {
+ DeployKind::Fleet | DeployKind::UpgradeToFleet => {
+ nix.comparg("--to", format!("ssh-ng://{}", self.name));
+ }
+ DeployKind::NixosInstall => {
+ nix
+ // Signature checking makes no sense with remote-store store argument set, as we're not even interacting with remote nix daemon
+ .arg("--no-check-sigs")
+ .comparg(
+ "--to",
+ format!("ssh-ng://root@{}-install?remote-store=/mnt", self.name),
+ );
+ }
+ }
+ nix.arg(path);
nix.run_nix().await.context("nix copy")?;
Ok(path.to_owned())
}
@@ -354,6 +435,7 @@
local: true,
session: OnceLock::new(),
+ deploy_kind: OnceCell::new(),
}
}
@@ -372,6 +454,7 @@
 // TODO: Remove with connectivity refactor
local: self.localhost == name,
session: OnceLock::new(),
+ deploy_kind: OnceCell::new(),
})
}
pub async fn list_hosts(&self) -> Result<Vec<ConfigHost>> {
modules/nixos/meta.nixdiffbeforeafterboth--- a/modules/nixos/meta.nix
+++ b/modules/nixos/meta.nix
@@ -13,5 +13,13 @@
];
# Version of environment (fleet scripts such as rollback) already installed on the host
- config.environment.etc.FLEET_HOST.text = "1";
+ config = {
+ environment.etc.FLEET_HOST.text = "1";
+
+ # Flake/nix command support is assumed by fleet, let's add it here to avoid potential problems.
+ nix.settings.experimental-features = [
+ "nix-command"
+ "flakes"
+ ];
+ };
}