difftreelog
feat basic lustration helper
in: trunk
3 files changed
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6 host::{Config, ConfigHost, DeployKind},7 opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 Self::Upload => None,39 Self::Test => Some("test"),40 Self::Boot => Some("boot"),41 Self::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test | Self::Boot)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. 
Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75 let mut parts = g.split_whitespace();76 let id = parts.next()?;77 let id: u32 = id.parse().ok()?;78 let date = parts.next()?;79 let time = parts.next()?;80 let current = if let Some(current) = parts.next() {81 if current == "(current)" {82 Some(true)83 } else {84 None85 }86 } else {87 Some(false)88 };89 let current = current?;90 if parts.next().is_some() {91 warn!("unexpected text after generation: {g}");92 }93 Some(Generation {94 id,95 current,96 datetime: format!("{date} {time}"),97 })98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101 let mut cmd = host.cmd("nix-env").await?;102 cmd.comparg("--profile", "/nix/var/nix/profiles/system")103 .arg("--list-generations");104 // Sudo is required due to --list-generations acquiring lock on the profile.105 let data = cmd.sudo().run_string().await?;106 let generations = data107 .split('\n')108 .map(|e| e.trim())109 .filter(|&l| !l.is_empty())110 .filter_map(|g| {111 let gen = parse_generation_line(g);112 if gen.is_none() {113 warn!("bad generation: {g}");114 }115 gen116 })117 .collect::<Vec<_>>();118 let current = generations119 .into_iter()120 .filter(|g| g.current)121 .at_most_one()122 .map_err(|_e| anyhow!("bad list-generations output"))?123 .ok_or_else(|| anyhow!("failed to find generation"))?;124 Ok(current)125}126127async fn deploy_task(128 action: DeployAction,129 host: &ConfigHost,130 built: PathBuf,131 specialisation: Option<String>,132 disable_rollback: bool,133) -> Result<()> {134 let deploy_kind = host.deploy_kind().await?;135 if deploy_kind == DeployKind::NixosInstall136 && !matches!(action, DeployAction::Boot | 
DeployAction::Upload)137 {138 bail!("nixos-install deploy kind only supports boot and upload actions");139 }140141 let mut failed = false;142143 // TODO: Lockfile, to prevent concurrent system switch?144 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146 // unit name conflict in systemd-run147 // This code is tied to rollback.nix148 if !disable_rollback && action.should_create_rollback_marker() {149 let _span = info_span!("preparing").entered();150 info!("preparing for rollback");151 let generation = get_current_generation(host).await?;152 info!(153 "rollback target would be {} {}",154 generation.id, generation.datetime155 );156 {157 let mut cmd = host.cmd("sh").await?;158 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159 if let Err(e) = cmd.sudo().run().await {160 error!("failed to set rollback marker: {e}");161 failed = true;162 }163 }164 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165 // Kicking it on manually will work best.166 //167 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168 // only allow one instance of it.169170 // TODO: We should also watch how this process is going.171 // After running this command, we have less than 3 minutes to deploy everything,172 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.173 // Anyway, reboot will still help in this case.174 if action.should_schedule_rollback_run() {175 let mut cmd = host.cmd("systemd-run").await?;176 cmd.comparg("--on-active", "3min")177 .comparg("--unit", "rollback-watchdog-run")178 .arg("systemctl")179 .arg("start")180 
.arg("rollback-watchdog.service");181 if let Err(e) = cmd.sudo().run().await {182 error!("failed to schedule rollback run: {e}");183 failed = true;184 }185 }186 }187 if deploy_kind == DeployKind::NixosInstall {188 info!(189 "running nixos-install to switch profile, install bootloader, and perform activation"190 );191 let mut cmd = host.cmd("nixos-install").await?;192 cmd.arg("--system").arg(&built).args([193 // Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.194 // It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.195 "--no-channel-copy",196 "--root",197 "/mnt",198 ]);199 if let Err(e) = cmd.sudo().run().await {200 error!("failed to execute nixos-install: {e}");201 failed = true;202 }203 } else {204 if action.should_switch_profile() && !failed {205 info!("switching system profile generation");206207 // To avoid even more problems, using nixos-install for now.208 // // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,209 // // falling back to using nix-env command210 // // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.211 // let mut cmd = host.cmd("nix-env").await?;212 // cmd.args([213 // "--store",214 // "/mnt",215 // "--profile",216 // "/mnt/nix/var/nix/profiles/system",217 // "--set",218 // ])219 // .arg(&built);220 // if let Err(e) = cmd.sudo().run_nix().await {221 // error!("failed to switch system profile generation: {e}");222 // failed = true;223 // }224 // It would also be possible to update profile atomically during copy:225 // https://github.com/NixOS/nix/pull/11657226 let mut cmd = host.nix_cmd().await?;227 cmd.arg("build");228 cmd.comparg("--profile", "/nix/var/nix/profiles/system");229 cmd.arg(&built);230 if let Err(e) = cmd.sudo().run_nix().await {231 error!("failed to switch system 
profile generation: {e}");232 failed = true;233 }234 }235236 // FIXME: Connection might be disconnected after activation run237238 if action.should_activate() && !failed {239 let _span = info_span!("activating").entered();240 info!("executing activation script");241 let specialised = if let Some(specialisation) = specialisation {242 let mut specialised = built.join("specialisation");243 specialised.push(specialisation);244 specialised245 } else {246 built.clone()247 };248 let switch_script = specialised.join("bin/switch-to-configuration");249 let mut cmd = host.cmd(switch_script).in_current_span().await?;250 cmd.env("FLEET_ONLINE_ACTIVATION", "1")251 .arg(action.name().expect("upload.should_activate == false"));252 if let Err(e) = cmd.sudo().run().in_current_span().await {253 error!("failed to activate: {e}");254 failed = true;255 }256 }257 }258 if action.should_create_rollback_marker() {259 if !disable_rollback {260 if failed {261 if action.should_schedule_rollback_run() {262 info!("executing rollback");263 if let Err(e) = host264 .systemctl_start("rollback-watchdog.service")265 .instrument(info_span!("rollback"))266 .await267 {268 error!("failed to trigger rollback: {e}")269 }270 }271 } else {272 info!("trying to mark upgrade as successful");273 if let Err(e) = host274 .rm_file("/etc/fleet_rollback_marker", true)275 .in_current_span()276 .await277 {278 error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")279 }280 }281 info!("disarming watchdog, just in case");282 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {283 // It is ok, if there was no reboot - then timer might not be running.284 }285 if action.should_schedule_rollback_run() {286 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {287 error!("failed to disarm rollback run: {e}");288 }289 }290 } else if let Err(_e) = host291 .rm_file("/etc/fleet_rollback_marker", true)292 .in_current_span()293 .await294 {295 // Marker might not exist, yet better try to remove it.296 }297 }298 Ok(())299}300301async fn build_task(302 config: Config,303 hostname: String,304 build_attr: &str,305 batch: Option<NixBuildBatch>,306) -> Result<PathBuf> {307 info!("building");308 let host = config.host(&hostname).await?;309 // let action = Action::from(self.subcommand.clone());310 let nixos = host.nixos_config().await?;311 let drv = nix_go!(nixos.system.build[{ build_attr }]);312 let outputs = drv.build_maybe_batch(batch).await?;313 let out_output = outputs314 .get("out")315 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;316317 {318 info!("adding gc root");319 let mut cmd = config.local_host().cmd("nix").await?;320 cmd.arg("build")321 .comparg(322 "--profile",323 format!(324 "/nix/var/nix/profiles/{}-{hostname}",325 config.data().gc_root_prefix326 ),327 )328 .arg(out_output);329 cmd.sudo().run_nix().await?;330 }331332 Ok(out_output.clone())333}334335impl BuildSystems {336 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {337 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;338 let set = LocalSet::new();339 let build_attr = self.build_attr.clone();340 let batch = (hosts.len() > 1).then(|| {341 config342 .nix_session343 .new_build_batch("build-hosts".to_string())344 });345 for host in hosts {346 let config = config.clone();347 let span = info_span!("build", host = 
field::display(&host.name));348 let hostname = host.name;349 let build_attr = build_attr.clone();350 let batch = batch.clone();351 set.spawn_local(352 (async move {353 let built = match build_task(config, hostname.clone(), &build_attr, batch).await354 {355 Ok(path) => path,356 Err(e) => {357 error!("failed to deploy host: {}", e);358 return;359 }360 };361 // TODO: Handle error362 let mut out = current_dir().expect("cwd exists");363 out.push(format!("built-{}", hostname));364365 info!("linking iso image to {:?}", out);366 if let Err(e) = symlink(built, out) {367 error!("failed to symlink: {e}")368 }369 })370 .instrument(span),371 );372 }373 drop(batch);374 set.await;375 Ok(())376 }377}378379impl Deploy {380 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {381 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;382 let set = LocalSet::new();383 let batch = (hosts.len() > 1).then(|| {384 config385 .nix_session386 .new_build_batch("deploy-hosts".to_string())387 });388 for host in hosts.into_iter() {389 let config = config.clone();390 let span = info_span!("deploy", host = field::display(&host.name));391 let hostname = host.name.clone();392 let local_host = config.local_host();393 let opts = opts.clone();394 let batch = batch.clone();395 if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, "deploy_kind").await? 
{396 host.set_deploy_kind(deploy_kind);397 };398399 set.spawn_local(400 (async move {401 let built =402 match build_task(config.clone(), hostname.clone(), "toplevel", batch).await403 {404 Ok(path) => path,405 Err(e) => {406 error!("failed to build host system closure: {}", e);407 return;408 }409 };410411 let deploy_kind = match host.deploy_kind().await {412 Ok(v) => v,413 Err(e) => {414 error!("failed to query target deploy kind: {e}");415 return;416 }417 };418419 // TODO: Make disable_rollback a host attribute instead420 let mut disable_rollback = self.disable_rollback;421 if !disable_rollback && deploy_kind != DeployKind::Fleet {422 warn!("disabling rollback, as not supported by non-fleet deployment kinds");423 disable_rollback = true;424 }425426 if !opts.is_local(&hostname) {427 info!("uploading system closure");428 {429 // TODO: Move to remote_derivation method.430 // Alternatively, nix store make-content-addressed can be used,431 // at least for the first deployment, to provide trusted store key.432 //433 // It is much slower, yet doesn't require root on the deployer machine.434 let Ok(mut sign) = local_host.cmd("nix").await else {435 error!("failed to setup local");436 return;437 };438 // Private key for host machine is registered in nix-sign.nix439 sign.arg("store")440 .arg("sign")441 .comparg("--key-file", "/etc/nix/private-key")442 .arg("-r")443 .arg(&built);444 if let Err(e) = sign.sudo().run_nix().await {445 warn!("failed to sign store paths: {e}");446 };447 }448 let mut tries = 0;449 loop {450 match host.remote_derivation(&built).await {451 Ok(remote) => {452 assert!(remote == built, "CA derivations aren't implemented");453 break;454 }455 Err(e) if tries < 3 => {456 tries += 1;457 warn!("copy failure ({}/3): {}", tries, e);458 sleep(Duration::from_millis(5000)).await;459 }460 Err(e) => {461 error!("upload failed: {e}");462 return;463 }464 }465 }466 }467 if let Err(e) = deploy_task(468 self.action,469 &host,470 built,471 if let Ok(v) = 
opts.action_attr(&host, "specialisation").await {472 v473 } else {474 error!("unreachable? failed to get specialization");475 return;476 },477 disable_rollback,478 )479 .await480 {481 error!("activation failed: {e}");482 }483 })484 .instrument(span),485 );486 }487 drop(batch);488 set.await;489 Ok(())490 }491}1use std::{env::current_dir, os::unix::fs::symlink, path::PathBuf, time::Duration};23use anyhow::{anyhow, bail, Context, Result};4use clap::{Parser, ValueEnum};5use fleet_base::{6 host::{Config, ConfigHost, DeployKind},7 opts::FleetOpts,8};9use itertools::Itertools as _;10use nix_eval::{nix_go, NixBuildBatch};11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 Self::Upload => None,39 Self::Test => Some("test"),40 Self::Boot => Some("boot"),41 Self::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test | Self::Boot)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next 
boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}7374fn parse_generation_line(g: &str) -> Option<Generation> {75 let mut parts = g.split_whitespace();76 let id = parts.next()?;77 let id: u32 = id.parse().ok()?;78 let date = parts.next()?;79 let time = parts.next()?;80 let current = if let Some(current) = parts.next() {81 if current == "(current)" {82 Some(true)83 } else {84 None85 }86 } else {87 Some(false)88 };89 let current = current?;90 if parts.next().is_some() {91 warn!("unexpected text after generation: {g}");92 }93 Some(Generation {94 id,95 current,96 datetime: format!("{date} {time}"),97 })98}99100async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {101 let mut cmd = host.cmd("nix-env").await?;102 cmd.comparg("--profile", "/nix/var/nix/profiles/system")103 .arg("--list-generations");104 // Sudo is required due to --list-generations acquiring lock on the profile.105 let data = cmd.sudo().run_string().await?;106 let generations = data107 .split('\n')108 .map(|e| e.trim())109 .filter(|&l| !l.is_empty())110 .filter_map(|g| {111 let gen = parse_generation_line(g);112 if gen.is_none() {113 warn!("bad generation: {g}");114 }115 gen116 })117 .collect::<Vec<_>>();118 let current = generations119 .into_iter()120 .filter(|g| g.current)121 .at_most_one()122 .map_err(|_e| anyhow!("bad list-generations output"))?123 .ok_or_else(|| anyhow!("failed to find generation"))?;124 Ok(current)125}126127async fn deploy_task(128 action: DeployAction,129 host: &ConfigHost,130 
built: PathBuf,131 specialisation: Option<String>,132 disable_rollback: bool,133) -> Result<()> {134 let deploy_kind = host.deploy_kind().await?;135 if (deploy_kind == DeployKind::NixosInstall || deploy_kind == DeployKind::NixosLustrate)136 && !matches!(action, DeployAction::Boot | DeployAction::Upload)137 {138 bail!("{deploy_kind:?} deploy kind only supports boot and upload actions");139 }140141 let mut failed = false;142143 // TODO: Lockfile, to prevent concurrent system switch?144 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback145 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to146 // unit name conflict in systemd-run147 // This code is tied to rollback.nix148 if !disable_rollback && action.should_create_rollback_marker() {149 let _span = info_span!("preparing").entered();150 info!("preparing for rollback");151 let generation = get_current_generation(host).await?;152 info!(153 "rollback target would be {} {}",154 generation.id, generation.datetime155 );156 {157 let mut cmd = host.cmd("sh").await?;158 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));159 if let Err(e) = cmd.sudo().run().await {160 error!("failed to set rollback marker: {e}");161 failed = true;162 }163 }164 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.165 // Kicking it on manually will work best.166 //167 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will168 // only allow one instance of it.169170 // TODO: We should also watch how this process is going.171 // After running this command, we have less than 3 minutes to deploy everything,172 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break 
something.173 // Anyway, reboot will still help in this case.174 if action.should_schedule_rollback_run() {175 let mut cmd = host.cmd("systemd-run").await?;176 cmd.comparg("--on-active", "3min")177 .comparg("--unit", "rollback-watchdog-run")178 .arg("systemctl")179 .arg("start")180 .arg("rollback-watchdog.service");181 if let Err(e) = cmd.sudo().run().await {182 error!("failed to schedule rollback run: {e}");183 failed = true;184 }185 }186 }187 if deploy_kind == DeployKind::NixosLustrate {188 // Fleet could also create this file, but as this operation is potentially disruptive,189 // make user do it themself.190 if !host.file_exists("/etc/NIXOS_LUSTRATE").await? {191 bail!("/etc/NIXOS_LUSTRATE should be created on remote host");192 }193 // Wanted by NixOS to recognize the system as NixOS.194 let mut cmd = host.cmd("touch").await?;195 cmd.arg("/etc/NIXOS");196 cmd.sudo().run().await.context("creating /etc/NIXOS")?;197 }198 if deploy_kind == DeployKind::NixosInstall {199 info!(200 "running nixos-install to switch profile, install bootloader, and perform activation"201 );202 let mut cmd = host.cmd("nixos-install").await?;203 cmd.arg("--system").arg(&built).args([204 // Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.205 // It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.206 "--no-channel-copy",207 "--root",208 "/mnt",209 ]);210 if let Err(e) = cmd.sudo().run().await {211 error!("failed to execute nixos-install: {e}");212 failed = true;213 }214 } else {215 if action.should_switch_profile() && !failed {216 info!("switching system profile generation");217218 // To avoid even more problems, using nixos-install for now.219 // // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,220 // // falling back to using nix-env command221 // // After stable NixOS starts using 2.26 - use `nix --store 
/mnt copy --from /mnt --profile ...` here, and instead of nix build below.222 // let mut cmd = host.cmd("nix-env").await?;223 // cmd.args([224 // "--store",225 // "/mnt",226 // "--profile",227 // "/mnt/nix/var/nix/profiles/system",228 // "--set",229 // ])230 // .arg(&built);231 // if let Err(e) = cmd.sudo().run_nix().await {232 // error!("failed to switch system profile generation: {e}");233 // failed = true;234 // }235 // It would also be possible to update profile atomically during copy:236 // https://github.com/NixOS/nix/pull/11657237 let mut cmd = host.nix_cmd().await?;238 cmd.arg("build");239 cmd.comparg("--profile", "/nix/var/nix/profiles/system");240 cmd.arg(&built);241 if let Err(e) = cmd.sudo().run_nix().await {242 error!("failed to switch system profile generation: {e}");243 failed = true;244 }245 }246247 // FIXME: Connection might be disconnected after activation run248249 if action.should_activate() && !failed {250 let _span = info_span!("activating").entered();251 info!("executing activation script");252 let specialised = if let Some(specialisation) = specialisation {253 let mut specialised = built.join("specialisation");254 specialised.push(specialisation);255 specialised256 } else {257 built.clone()258 };259 let switch_script = specialised.join("bin/switch-to-configuration");260 let mut cmd = host.cmd(switch_script).in_current_span().await?;261 if deploy_kind == DeployKind::NixosLustrate {262 cmd.env("NIXOS_INSTALL_BOOTLOADER", "1");263 }264 cmd.env("FLEET_ONLINE_ACTIVATION", "1")265 .arg(action.name().expect("upload.should_activate == false"));266 if let Err(e) = cmd.sudo().run().in_current_span().await {267 error!("failed to activate: {e}");268 failed = true;269 }270 }271 }272 if action.should_create_rollback_marker() {273 if !disable_rollback {274 if failed {275 if action.should_schedule_rollback_run() {276 info!("executing rollback");277 if let Err(e) = host278 .systemctl_start("rollback-watchdog.service")279 
.instrument(info_span!("rollback"))280 .await281 {282 error!("failed to trigger rollback: {e}")283 }284 }285 } else {286 info!("trying to mark upgrade as successful");287 if let Err(e) = host288 .rm_file("/etc/fleet_rollback_marker", true)289 .in_current_span()290 .await291 {292 error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")293 }294 }295 info!("disarming watchdog, just in case");296 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {297 // It is ok, if there was no reboot - then timer might not be running.298 }299 if action.should_schedule_rollback_run() {300 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {301 error!("failed to disarm rollback run: {e}");302 }303 }304 } else if let Err(_e) = host305 .rm_file("/etc/fleet_rollback_marker", true)306 .in_current_span()307 .await308 {309 // Marker might not exist, yet better try to remove it.310 }311 }312 Ok(())313}314315async fn build_task(316 config: Config,317 hostname: String,318 build_attr: &str,319 batch: Option<NixBuildBatch>,320) -> Result<PathBuf> {321 info!("building");322 let host = config.host(&hostname).await?;323 // let action = Action::from(self.subcommand.clone());324 let nixos = host.nixos_config().await?;325 let drv = nix_go!(nixos.system.build[{ build_attr }]);326 let outputs = drv.build_maybe_batch(batch).await?;327 let out_output = outputs328 .get("out")329 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;330331 {332 info!("adding gc root");333 let mut cmd = config.local_host().cmd("nix").await?;334 cmd.arg("build")335 .comparg(336 "--profile",337 format!(338 "/nix/var/nix/profiles/{}-{hostname}",339 config.data().gc_root_prefix340 ),341 )342 .arg(out_output);343 cmd.sudo().run_nix().await?;344 }345346 Ok(out_output.clone())347}348349impl BuildSystems {350 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {351 let hosts = 
opts.filter_skipped(config.list_hosts().await?).await?;352 let set = LocalSet::new();353 let build_attr = self.build_attr.clone();354 let batch = (hosts.len() > 1).then(|| {355 config356 .nix_session357 .new_build_batch("build-hosts".to_string())358 });359 for host in hosts {360 let config = config.clone();361 let span = info_span!("build", host = field::display(&host.name));362 let hostname = host.name;363 let build_attr = build_attr.clone();364 let batch = batch.clone();365 set.spawn_local(366 (async move {367 let built = match build_task(config, hostname.clone(), &build_attr, batch).await368 {369 Ok(path) => path,370 Err(e) => {371 error!("failed to deploy host: {}", e);372 return;373 }374 };375 // TODO: Handle error376 let mut out = current_dir().expect("cwd exists");377 out.push(format!("built-{}", hostname));378379 info!("linking iso image to {:?}", out);380 if let Err(e) = symlink(built, out) {381 error!("failed to symlink: {e}")382 }383 })384 .instrument(span),385 );386 }387 drop(batch);388 set.await;389 Ok(())390 }391}392393impl Deploy {394 pub async fn run(self, config: &Config, opts: &FleetOpts) -> Result<()> {395 let hosts = opts.filter_skipped(config.list_hosts().await?).await?;396 let set = LocalSet::new();397 let batch = (hosts.len() > 1).then(|| {398 config399 .nix_session400 .new_build_batch("deploy-hosts".to_string())401 });402 for host in hosts.into_iter() {403 let config = config.clone();404 let span = info_span!("deploy", host = field::display(&host.name));405 let hostname = host.name.clone();406 let local_host = config.local_host();407 let opts = opts.clone();408 let batch = batch.clone();409 if let Some(deploy_kind) = opts.action_attr::<DeployKind>(&host, "deploy_kind").await? 
{410 host.set_deploy_kind(deploy_kind);411 };412413 set.spawn_local(414 (async move {415 let built =416 match build_task(config.clone(), hostname.clone(), "toplevel", batch).await417 {418 Ok(path) => path,419 Err(e) => {420 error!("failed to build host system closure: {}", e);421 return;422 }423 };424425 let deploy_kind = match host.deploy_kind().await {426 Ok(v) => v,427 Err(e) => {428 error!("failed to query target deploy kind: {e}");429 return;430 }431 };432433 // TODO: Make disable_rollback a host attribute instead434 let mut disable_rollback = self.disable_rollback;435 if !disable_rollback && deploy_kind != DeployKind::Fleet {436 warn!("disabling rollback, as not supported by non-fleet deployment kinds");437 disable_rollback = true;438 }439440 if !opts.is_local(&hostname) {441 info!("uploading system closure");442 {443 // TODO: Move to remote_derivation method.444 // Alternatively, nix store make-content-addressed can be used,445 // at least for the first deployment, to provide trusted store key.446 //447 // It is much slower, yet doesn't require root on the deployer machine.448 let Ok(mut sign) = local_host.cmd("nix").await else {449 error!("failed to setup local");450 return;451 };452 // Private key for host machine is registered in nix-sign.nix453 sign.arg("store")454 .arg("sign")455 .comparg("--key-file", "/etc/nix/private-key")456 .arg("-r")457 .arg(&built);458 if let Err(e) = sign.sudo().run_nix().await {459 warn!("failed to sign store paths: {e}");460 };461 }462 let mut tries = 0;463 loop {464 match host.remote_derivation(&built).await {465 Ok(remote) => {466 assert!(remote == built, "CA derivations aren't implemented");467 break;468 }469 Err(e) if tries < 3 => {470 tries += 1;471 warn!("copy failure ({}/3): {}", tries, e);472 sleep(Duration::from_millis(5000)).await;473 }474 Err(e) => {475 error!("upload failed: {e}");476 return;477 }478 }479 }480 }481 if let Err(e) = deploy_task(482 self.action,483 &host,484 built,485 if let Ok(v) = 
opts.action_attr(&host, "specialisation").await {486 v487 } else {488 error!("unreachable? failed to get specialization");489 return;490 },491 disable_rollback,492 )493 .await494 {495 error!("activation failed: {e}");496 }497 })498 .instrument(span),499 );500 }501 drop(batch);502 set.await;503 Ok(())504 }505}crates/fleet-base/src/host.rsdiffbeforeafterboth--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -23,8 +23,10 @@
};
pub struct FleetConfigInternals {
+ /// Fleet project directory, containing fleet.nix file.
+ pub directory: PathBuf,
+ /// builtins.currentSystem
pub local_system: String,
- pub directory: PathBuf,
pub data: Mutex<FleetData>,
pub nix_args: Vec<OsString>,
/// fleet_config.config
@@ -34,6 +36,7 @@
/// import nixpkgs {system = local};
pub default_pkgs: Value,
+ /// inputs.nixpkgs
pub nixpkgs: Value,
pub nix_session: NixSession,
@@ -58,7 +61,7 @@
Su,
}
-#[derive(Clone, PartialEq, Copy)]
+#[derive(Clone, PartialEq, Copy, Debug)]
pub enum DeployKind {
/// NixOS => NixOS managed by fleet
UpgradeToFleet,
@@ -67,6 +70,10 @@
/// Remote host has /mnt, /mnt/boot mounted,
/// generated config is added to fleet configuration.
NixosInstall,
+ /// Remote host has some system and nix installed in multi-user mode (/nix is owned by root),
+ /// generated config is added to fleet configuration,
+ /// and /etc/NIXOS_LUSTRATE exists; fleet will perform the rest.
+ NixosLustrate,
}
impl FromStr for DeployKind {
@@ -302,7 +309,7 @@
nix.arg("copy").arg("--substitute-on-destination");
match self.deploy_kind().await? {
- DeployKind::Fleet | DeployKind::UpgradeToFleet => {
+ DeployKind::Fleet | DeployKind::UpgradeToFleet | DeployKind::NixosLustrate => {
nix.comparg("--to", format!("ssh-ng://{}", self.name));
}
DeployKind::NixosInstall => {
crates/fleet-base/src/opts.rsdiffbeforeafterboth--- a/crates/fleet-base/src/opts.rs
+++ b/crates/fleet-base/src/opts.rs
@@ -6,7 +6,7 @@
sync::{Arc, Mutex},
};
-use anyhow::{Context, Result};
+use anyhow::{bail, Context, Result};
use clap::Parser;
use nix_eval::{nix_go, util::assert_warn, NixSessionPool, Value};
use nom::{
@@ -182,7 +182,23 @@
// TODO: Config should be detached from opts.
pub async fn build(&self, nix_args: Vec<OsString>, assert: bool) -> Result<Config> {
- let directory = current_dir()?;
+ let cwd = current_dir()?;
+ let mut directory = cwd.clone();
+ let mut fleet_data_path = directory.join("fleet.nix");
+ while !fleet_data_path.is_file() {
+ // Strip the trailing "fleet.nix" component, then step both paths up to the parent directory.
+ fleet_data_path.pop();
+ if !directory.pop() || !fleet_data_path.pop() {
+ bail!(
+ "fleet.nix not found at {} or any of the parent directories",
+ cwd.display()
+ );
+ }
+ fleet_data_path.push("fleet.nix");
+ }
+ let bytes =
+ std::fs::read_to_string(&fleet_data_path).context("reading fleet state (fleet.nix)")?;
+ let data: Mutex<FleetData> = nixlike::parse_str(&bytes)?;
let pool = NixSessionPool::new(
directory.as_os_str().to_owned(),
@@ -193,12 +209,6 @@
let nix_session = pool.get().await?;
let builtins_field = Value::binding(nix_session.clone(), "builtins").await?;
-
- let mut fleet_data_path = directory.clone();
- fleet_data_path.push("fleet.nix");
- let bytes =
- std::fs::read_to_string(fleet_data_path).context("reading fleet state (fleet.nix)")?;
- let data: Mutex<FleetData> = nixlike::parse_str(&bytes)?;
let fleet_root = Value::binding(nix_session.clone(), "fleetConfigurations").await?;
let fleet_field = nix_go!(fleet_root.default({ data }));