difftreelog
feat sign in remote_derivation
in: trunk
3 files changed
crates/fleet-base/src/deploy.rsdiffbeforeafterboth1use std::{path::PathBuf, time::Duration};23use anyhow::{Context as _, Result, anyhow, bail};4use clap::ValueEnum;5use itertools::Itertools;6use tokio::time::sleep;7use tracing::{Instrument as _, error, info, info_span, warn};89use crate::host::{Config, ConfigHost, DeployKind, Generation, GenerationStorage};1011#[derive(ValueEnum, Clone, Copy)]12pub enum DeployAction {13 /// Upload derivation, but do not execute the update.14 Upload,15 /// Upload and execute the activation script, old version will be used after reboot.16 Test,17 /// Upload and set as current system profile, but do not execute activation script.18 Boot,19 /// Upload, set current profile, and execute activation script.20 Switch,21}2223impl DeployAction {24 pub(crate) fn name(&self) -> Option<&'static str> {25 match self {26 Self::Upload => None,27 Self::Test => Some("test"),28 Self::Boot => Some("boot"),29 Self::Switch => Some("switch"),30 }31 }32 pub(crate) fn should_switch_profile(&self) -> bool {33 matches!(self, Self::Switch | Self::Boot)34 }35 pub(crate) fn should_activate(&self) -> bool {36 matches!(self, Self::Switch | Self::Test | Self::Boot)37 }38 pub(crate) fn should_create_rollback_marker(&self) -> bool {39 // Upload does nothing on the target machine, other than uploading the closure.40 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.41 !matches!(self, Self::Upload)42 }43 pub(crate) fn should_schedule_rollback_run(&self) -> bool {44 matches!(self, Self::Switch | Self::Test)45 }46}4748async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {49 let generations = host.list_generations("system").await?;50 let current = generations51 .into_iter()52 .filter(|g| g.current)53 .at_most_one()54 .map_err(|_e| anyhow!("bad list-generations output"))?55 .ok_or_else(|| anyhow!("failed to find generation"))?;56 Ok(current)57}5859pub async fn deploy_task(60 action: DeployAction,61 host: &ConfigHost,62 built: PathBuf,63 specialisation: Option<String>,64 disable_rollback: bool,65) -> Result<()> {66 let deploy_kind = host.deploy_kind().await?;67 if (deploy_kind == DeployKind::NixosInstall || deploy_kind == DeployKind::NixosLustrate)68 && !matches!(action, DeployAction::Boot | DeployAction::Upload)69 {70 bail!("{deploy_kind:?} deploy kind only supports boot and upload actions");71 }7273 let mut failed = false;7475 // TODO: Lockfile, to prevent concurrent system switch?76 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback77 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to78 // unit name conflict in systemd-run79 // This code is tied to rollback.nix80 if !disable_rollback && action.should_create_rollback_marker() {81 // let _span = info_span!("preparing").entered();82 info!("preparing for rollback");83 let generation = get_current_generation(host).await?;84 info!(85 "rollback target would be {} {}",86 generation.id, generation.datetime87 );88 {89 let mut cmd = host.cmd("sh").await?;90 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));91 if let Err(e) = cmd.sudo().run().await {92 error!("failed to set rollback marker: {e}");93 failed = true;94 }95 }96 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.97 // Kicking it on manually will work best.98 //99 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will100 // only allow one instance of it.101102 // TODO: We should also watch how this process is going.103 // After running this command, we have less than 3 minutes to deploy everything,104 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.105 // Anyway, reboot will still help in this case.106 if action.should_schedule_rollback_run() {107 let mut cmd = host.cmd("systemd-run").await?;108 cmd.comparg("--on-active", "3min")109 .comparg("--unit", "rollback-watchdog-run")110 .arg("systemctl")111 .arg("start")112 .arg("rollback-watchdog.service");113 if let Err(e) = cmd.sudo().run().await {114 error!("failed to schedule rollback run: {e}");115 failed = true;116 }117 }118 }119 if deploy_kind == DeployKind::NixosLustrate {120 // Fleet could also create this file, but as this operation is potentially disruptive,121 // make user do it themself.122 if !host.file_exists("/etc/NIXOS_LUSTRATE").await? {123 bail!("/etc/NIXOS_LUSTRATE should be created on remote host");124 }125 // Wanted by NixOS to recognize the system as NixOS.126 let mut cmd = host.cmd("touch").await?;127 cmd.arg("/etc/NIXOS");128 cmd.sudo().run().await.context("creating /etc/NIXOS")?;129 }130 if deploy_kind == DeployKind::NixosInstall {131 info!(132 "running nixos-install to switch profile, install bootloader, and perform activation"133 );134 let mut cmd = host.cmd("nixos-install").await?;135 cmd.arg("--system").arg(&built).args([136 // Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.137 // It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.138 "--no-channel-copy",139 "--root",140 "/mnt",141 ]);142 if let Err(e) = cmd.sudo().run().await {143 error!("failed to execute nixos-install: {e}");144 failed = true;145 }146 } else {147 if action.should_switch_profile() && !failed {148 info!("switching system profile generation");149150 // To avoid even more problems, using nixos-install for now.151 // // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,152 // // falling back to using nix-env command153 // // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.154 // let mut cmd = host.cmd("nix-env").await?;155 // cmd.args([156 // "--store",157 // "/mnt",158 // "--profile",159 // "/mnt/nix/var/nix/profiles/system",160 // "--set",161 // ])162 // .arg(&built);163 // if let Err(e) = cmd.sudo().run_nix().await {164 // error!("failed to switch system profile generation: {e}");165 // failed = true;166 // }167 // It would also be possible to update profile atomically during copy:168 // https://github.com/NixOS/nix/pull/11657169 let mut cmd = host.nix_cmd().await?;170 cmd.arg("build");171 cmd.comparg("--profile", "/nix/var/nix/profiles/system");172 cmd.arg(&built);173 if let Err(e) = cmd.sudo().run_nix().await {174 error!("failed to switch system profile generation: {e}");175 failed = true;176 }177 }178179 // FIXME: Connection might be disconnected after activation run180181 if action.should_activate() && !failed {182 // let _span = info_span!("activating").entered();183 info!("executing activation script");184 let specialised = if let Some(specialisation) = specialisation {185 let mut specialised = built.join("specialisation");186 specialised.push(specialisation);187 specialised188 } else {189 built.clone()190 };191 let switch_script = specialised.join("bin/switch-to-configuration");192 let mut cmd = host.cmd("systemd-run").in_current_span().await?;193 cmd.arg("--collect")194 .arg("--no-ask-password")195 .arg("--pipe")196 .arg("--quiet")197 .arg("--service-type=exec")198 .arg("--unit=fleet-switch-to-configuration")199 .arg(switch_script);200 if deploy_kind == DeployKind::NixosLustrate {201 cmd.env("NIXOS_INSTALL_BOOTLOADER", "1");202 }203 cmd.env("FLEET_ONLINE_ACTIVATION", "1")204 .arg(action.name().expect("upload.should_activate == false"));205 if let Err(e) = cmd.sudo().run().in_current_span().await {206 error!("failed to activate: {e}");207 failed = true;208 }209 }210 }211 if action.should_create_rollback_marker() {212 if !disable_rollback {213 if failed {214 if action.should_schedule_rollback_run() {215 info!("executing rollback");216 if let Err(e) = host217 .systemctl_start("rollback-watchdog.service")218 .instrument(info_span!("rollback"))219 .await220 {221 error!("failed to trigger rollback: {e}")222 }223 }224 } else {225 info!("trying to mark upgrade as successful");226 if let Err(e) = host227 .rm_file("/etc/fleet_rollback_marker", true)228 .in_current_span()229 .await230 {231 error!(232 "failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}"233 )234 }235 }236 info!("disarming watchdog, just in case");237 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {238 // It is ok, if there was no reboot - then timer might not be running.239 }240 if action.should_schedule_rollback_run() {241 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {242 error!("failed to disarm rollback run: {e}");243 }244 }245 } else if let Err(_e) = host246 .rm_file("/etc/fleet_rollback_marker", true)247 .in_current_span()248 .await249 {250 // Marker might not exist, yet better try to remove it.251 }252 }253 Ok(())254}255256pub async fn upload_task(257 config: &Config,258 host: &ConfigHost,259 location: GenerationStorage,260 generation: PathBuf,261) -> Result<PathBuf> {262 let local_host = config.local_host();263 if matches!(location, GenerationStorage::Pusher) {264 bail!("pusher is not enabled in this version of fleet");265 }266 if !host.local {267 info!("uploading system closure");268 {269 // TODO: Move to remote_derivation method.270 // Alternatively, nix store make-content-addressed can be used,271 // at least for the first deployment, to provide trusted store key.272 //273 // It is much slower, yet doesn't require root on the deployer machine.274 let Ok(mut sign) = local_host.cmd("nix").await else {275 bail!("failed to setup local");276 };277 // Private key for host machine is registered in nix-sign.nix278 sign.arg("store")279 .arg("sign")280 .comparg("--key-file", "/etc/nix/private-key")281 .arg("-r")282 .arg(&generation);283 if let Err(e) = sign.sudo().run_nix().await {284 warn!("failed to sign store paths: {e}");285 };286 }287 let mut tries = 0;288 loop {289 match host.remote_derivation(&generation).await {290 Ok(remote) => {291 assert!(remote == generation, "CA derivations aren't implemented");292 return Ok(remote);293 }294 Err(e) if tries < 3 => {295 tries += 1;296 warn!("copy failure ({}/3): {}", tries, e);297 sleep(Duration::from_millis(5000)).await;298 }299 Err(e) => {300 bail!("upload failed: {e}");301 }302 }303 }304 }305 Ok(generation)306}1use std::{path::PathBuf, time::Duration};23use anyhow::{Context as _, Result, anyhow, bail};4use clap::ValueEnum;5use itertools::Itertools;6use tokio::time::sleep;7use tracing::{Instrument as _, error, info, info_span, warn};89use crate::host::{Config, ConfigHost, DeployKind, Generation, GenerationStorage};1011#[derive(ValueEnum, Clone, Copy)]12pub enum DeployAction {13 /// Upload derivation, but do not execute the update.14 Upload,15 /// Upload and execute the activation script, old version will be used after reboot.16 Test,17 /// Upload and set as current system profile, but do not execute activation script.18 Boot,19 /// Upload, set current profile, and execute activation script.20 Switch,21}2223impl DeployAction {24 pub(crate) fn name(&self) -> Option<&'static str> {25 match self {26 Self::Upload => None,27 Self::Test => Some("test"),28 Self::Boot => Some("boot"),29 Self::Switch => Some("switch"),30 }31 }32 pub(crate) fn should_switch_profile(&self) -> bool {33 matches!(self, Self::Switch | Self::Boot)34 }35 pub(crate) fn should_activate(&self) -> bool {36 matches!(self, Self::Switch | Self::Test | Self::Boot)37 }38 pub(crate) fn should_create_rollback_marker(&self) -> bool {39 // Upload does nothing on the target machine, other than uploading the closure.40 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.41 !matches!(self, Self::Upload)42 }43 pub(crate) fn should_schedule_rollback_run(&self) -> bool {44 matches!(self, Self::Switch | Self::Test)45 }46}4748async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {49 let generations = host.list_generations("system").await?;50 let current = generations51 .into_iter()52 .filter(|g| g.current)53 .at_most_one()54 .map_err(|_e| anyhow!("bad list-generations output"))?55 .ok_or_else(|| anyhow!("failed to find generation"))?;56 Ok(current)57}5859pub async fn deploy_task(60 action: DeployAction,61 host: &ConfigHost,62 built: PathBuf,63 specialisation: Option<String>,64 disable_rollback: bool,65) -> Result<()> {66 let deploy_kind = host.deploy_kind().await?;67 if (deploy_kind == DeployKind::NixosInstall || deploy_kind == DeployKind::NixosLustrate)68 && !matches!(action, DeployAction::Boot | DeployAction::Upload)69 {70 bail!("{deploy_kind:?} deploy kind only supports boot and upload actions");71 }7273 let mut failed = false;7475 // TODO: Lockfile, to prevent concurrent system switch?76 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback77 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to78 // unit name conflict in systemd-run79 // This code is tied to rollback.nix80 if !disable_rollback && action.should_create_rollback_marker() {81 // let _span = info_span!("preparing").entered();82 info!("preparing for rollback");83 let generation = get_current_generation(host).await?;84 info!(85 "rollback target would be {} {}",86 generation.id, generation.datetime87 );88 {89 let mut cmd = host.cmd("sh").await?;90 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));91 if let Err(e) = cmd.sudo().run().await {92 error!("failed to set rollback marker: {e}");93 failed = true;94 }95 }96 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.97 // Kicking it on manually will work best.98 //99 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will100 // only allow one instance of it.101102 // TODO: We should also watch how this process is going.103 // After running this command, we have less than 3 minutes to deploy everything,104 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.105 // Anyway, reboot will still help in this case.106 if action.should_schedule_rollback_run() {107 let mut cmd = host.cmd("systemd-run").await?;108 cmd.comparg("--on-active", "3min")109 .comparg("--unit", "rollback-watchdog-run")110 .arg("systemctl")111 .arg("start")112 .arg("rollback-watchdog.service");113 if let Err(e) = cmd.sudo().run().await {114 error!("failed to schedule rollback run: {e}");115 failed = true;116 }117 }118 }119 if deploy_kind == DeployKind::NixosLustrate {120 // Fleet could also create this file, but as this operation is potentially disruptive,121 // make user do it themself.122 if !host.file_exists("/etc/NIXOS_LUSTRATE").await? {123 bail!("/etc/NIXOS_LUSTRATE should be created on remote host");124 }125 // Wanted by NixOS to recognize the system as NixOS.126 let mut cmd = host.cmd("touch").await?;127 cmd.arg("/etc/NIXOS");128 cmd.sudo().run().await.context("creating /etc/NIXOS")?;129 }130 if deploy_kind == DeployKind::NixosInstall {131 info!(132 "running nixos-install to switch profile, install bootloader, and perform activation"133 );134 let mut cmd = host.cmd("nixos-install").await?;135 cmd.arg("--system").arg(&built).args([136 // Channels here aren't fleet host system channels, but channels embedded in installation cd, which might be old.137 // It is possible to copy host channels, but I would prefer non-flake nix just to be unsupported.138 "--no-channel-copy",139 "--root",140 "/mnt",141 ]);142 if let Err(e) = cmd.sudo().run().await {143 error!("failed to execute nixos-install: {e}");144 failed = true;145 }146 } else {147 if action.should_switch_profile() && !failed {148 info!("switching system profile generation");149150 // To avoid even more problems, using nixos-install for now.151 // // nix build is unable to work with --store argument for some reason, and nix until 2.26 didn't support copy with --profile argument,152 // // falling back to using nix-env command153 // // After stable NixOS starts using 2.26 - use `nix --store /mnt copy --from /mnt --profile ...` here, and instead of nix build below.154 // let mut cmd = host.cmd("nix-env").await?;155 // cmd.args([156 // "--store",157 // "/mnt",158 // "--profile",159 // "/mnt/nix/var/nix/profiles/system",160 // "--set",161 // ])162 // .arg(&built);163 // if let Err(e) = cmd.sudo().run_nix().await {164 // error!("failed to switch system profile generation: {e}");165 // failed = true;166 // }167 // It would also be possible to update profile atomically during copy:168 // https://github.com/NixOS/nix/pull/11657169 let mut cmd = host.nix_cmd().await?;170 cmd.arg("build");171 cmd.comparg("--profile", "/nix/var/nix/profiles/system");172 cmd.arg(&built);173 if let Err(e) = cmd.sudo().run_nix().await {174 error!("failed to switch system profile generation: {e}");175 failed = true;176 }177 }178179 // FIXME: Connection might be disconnected after activation run180181 if action.should_activate() && !failed {182 // let _span = info_span!("activating").entered();183 info!("executing activation script");184 let specialised = if let Some(specialisation) = specialisation {185 let mut specialised = built.join("specialisation");186 specialised.push(specialisation);187 specialised188 } else {189 built.clone()190 };191 let switch_script = specialised.join("bin/switch-to-configuration");192 let mut cmd = host.cmd("systemd-run").in_current_span().await?;193 cmd.arg("--collect")194 .arg("--no-ask-password")195 .arg("--pipe")196 .arg("--quiet")197 .arg("--service-type=exec")198 .arg("--unit=fleet-switch-to-configuration")199 .arg(switch_script);200 if deploy_kind == DeployKind::NixosLustrate {201 cmd.env("NIXOS_INSTALL_BOOTLOADER", "1");202 }203 cmd.env("FLEET_ONLINE_ACTIVATION", "1")204 .arg(action.name().expect("upload.should_activate == false"));205 if let Err(e) = cmd.sudo().run().in_current_span().await {206 error!("failed to activate: {e}");207 failed = true;208 }209 }210 }211 if action.should_create_rollback_marker() {212 if !disable_rollback {213 if failed {214 if action.should_schedule_rollback_run() {215 info!("executing rollback");216 if let Err(e) = host217 .systemctl_start("rollback-watchdog.service")218 .instrument(info_span!("rollback"))219 .await220 {221 error!("failed to trigger rollback: {e}")222 }223 }224 } else {225 info!("trying to mark upgrade as successful");226 if let Err(e) = host227 .rm_file("/etc/fleet_rollback_marker", true)228 .in_current_span()229 .await230 {231 error!(232 "failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}"233 )234 }235 }236 info!("disarming watchdog, just in case");237 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {238 // It is ok, if there was no reboot - then timer might not be running.239 }240 if action.should_schedule_rollback_run() {241 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {242 error!("failed to disarm rollback run: {e}");243 }244 }245 } else if let Err(_e) = host246 .rm_file("/etc/fleet_rollback_marker", true)247 .in_current_span()248 .await249 {250 // Marker might not exist, yet better try to remove it.251 }252 }253 Ok(())254}255256pub async fn upload_task(257 config: &Config,258 host: &ConfigHost,259 location: GenerationStorage,260 generation: PathBuf,261) -> Result<PathBuf> {262 if matches!(location, GenerationStorage::Pusher) {263 bail!("pusher is not enabled in this version of fleet");264 }265 if !host.local {266 info!("uploading system closure");267 let mut tries = 0;268 loop {269 match host.remote_derivation(&generation).await {270 Ok(remote) => {271 assert!(remote == generation, "CA derivations aren't implemented");272 return Ok(remote);273 }274 Err(e) if tries < 3 => {275 tries += 1;276 warn!("copy failure ({}/3): {}", tries, e);277 sleep(Duration::from_millis(5000)).await;278 }279 Err(e) => {280 bail!("upload failed: {e}");281 }282 }283 }284 }285 Ok(generation)286}crates/fleet-base/src/host.rsdiffbeforeafterboth--- a/crates/fleet-base/src/host.rs
+++ b/crates/fleet-base/src/host.rs
@@ -456,6 +456,20 @@
// Path is located locally, thus already trusted.
return Ok(path.to_owned());
}
+ let mut sign = MyCommand::new(
+ // TODO: Look at the current escalation strategy.
+ // ... or switch to run0 right after polkit update
+ EscalationStrategy::Sudo,
+ "nix",
+ );
+ sign.arg("store")
+ .arg("sign")
+ .comparg("--key-file", "/etc/nix/private-key")
+ .arg("-r")
+ .arg(&path);
+ if let Err(e) = sign.sudo().run_nix().await {
+ warn!("failed to sign store paths: {e}");
+ }
let mut nix = MyCommand::new(
// Not used
EscalationStrategy::Su,
crates/nix-eval/src/lib.rsdiffbeforeafterboth--- a/crates/nix-eval/src/lib.rs
+++ b/crates/nix-eval/src/lib.rs
@@ -308,7 +308,6 @@
}
static GLOBAL_STATE: LazyLock<GlobalState> = LazyLock::new(|| {
- info!("initializing nix global state");
GlobalState::new().expect("global state init shouldn't fail")
});