difftreelog
fix do not prepare for rollback on upload
in: trunk
1 file changed
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use crate::nix_go;8use anyhow::{anyhow, Result};9use clap::{Parser, ValueEnum};10use itertools::Itertools as _;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 action: DeployAction,20}2122#[derive(ValueEnum, Clone, Copy)]23enum DeployAction {24 /// Upload derivation, but do not execute the update.25 Upload,26 /// Upload and execute the activation script, old version will be used after reboot.27 Test,28 /// Upload and set as current system profile, but do not execute activation script.29 Boot,30 /// Upload, set current profile, and execute activation script.31 Switch,32}3334impl DeployAction {35 pub(crate) fn name(&self) -> Option<&'static str> {36 match self {37 DeployAction::Upload => None,38 DeployAction::Test => Some("test"),39 DeployAction::Boot => Some("boot"),40 DeployAction::Switch => Some("switch"),41 }42 }43 pub(crate) fn should_switch_profile(&self) -> bool {44 matches!(self, Self::Switch | Self::Boot)45 }46 pub(crate) fn should_activate(&self) -> bool {47 matches!(self, Self::Switch | Self::Test)48 }49 pub(crate) fn should_schedule_rollback_run(&self) -> bool {50 matches!(self, Self::Switch | Self::Test)51 }52}5354#[derive(Parser, Clone)]55pub struct BuildSystems {56 /// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes57 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.58 #[clap(long, default_value = "toplevel")]59 build_attr: String,60}6162struct Generation {63 id: u32,64 current: bool,65 datetime: String,66}67async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {68 let mut cmd = host.cmd("nix-env").await?;69 cmd.comparg("--profile", "/nix/var/nix/profiles/system")70 .arg("--list-generations");71 // Sudo is required due to --list-generations acquiring lock on the profile.72 let data = cmd.sudo().run_string().await?;73 let generations = data74 .split('\n')75 .map(|e| e.trim())76 .filter(|&l| !l.is_empty())77 .filter_map(|g| {78 let gen: Option<Generation> = try {79 let mut parts = g.split_whitespace();80 let id = parts.next()?;81 let id: u32 = id.parse().ok()?;82 let date = parts.next()?;83 let time = parts.next()?;84 let current = if let Some(current) = parts.next() {85 if current == "(current)" {86 Some(true)87 } else {88 None89 }90 } else {91 Some(false)92 };93 let current = current?;94 if parts.next().is_some() {95 warn!("unexpected text after generation: {g}");96 }97 Generation {98 id,99 current,100 datetime: format!("{date} {time}"),101 }102 };103 if gen.is_none() {104 warn!("bad generation: {g}")105 }106 gen107 })108 .collect::<Vec<_>>();109 let current = generations110 .into_iter()111 .filter(|g| g.current)112 .at_most_one()113 .map_err(|_e| anyhow!("bad list-generations output"))?114 .ok_or_else(|| anyhow!("failed to find generation"))?;115 Ok(current)116}117118async fn deploy_task(119 action: DeployAction,120 host: &ConfigHost,121 built: PathBuf,122 disable_rollback: bool,123) -> Result<()> {124 let mut failed = false;125 // TODO: Lockfile, to prevent concurrent system switch?126 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback127 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to128 // unit name conflict in systemd-run129 // This code is tied to rollback.nix130 if !disable_rollback {131 let _span = info_span!("preparing").entered();132 info!("preparing for rollback");133 let generation = get_current_generation(host).await?;134 info!(135 "rollback target would be {} {}",136 generation.id, generation.datetime137 );138 {139 let mut cmd = host.cmd("sh").await?;140 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));141 if let Err(e) = cmd.sudo().run().await {142 error!("failed to set rollback marker: {e}");143 failed = true;144 }145 }146 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.147 // Kicking it on manually will work best.148 //149 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will150 // only allow one instance of it.151152 // TODO: We should also watch how this process is going.153 // After running this command, we have less than 3 minutes to deploy everything,154 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.155 // Anyway, reboot will still help in this case.156 if action.should_schedule_rollback_run() {157 let mut cmd = host.cmd("systemd-run").await?;158 cmd.comparg("--on-active", "3min")159 .comparg("--unit", "rollback-watchdog-run")160 .arg("systemctl")161 .arg("start")162 .arg("rollback-watchdog.service");163 if let Err(e) = cmd.sudo().run().await {164 error!("failed to schedule rollback run: {e}");165 failed = true;166 }167 }168 }169170 if action.should_switch_profile() && !failed {171 info!("switching generation");172 let mut cmd = host.cmd("nix-env").await?;173 cmd.comparg("--profile", "/nix/var/nix/profiles/system")174 .comparg("--set", &built);175 if let Err(e) = cmd.sudo().run().await {176 error!("failed to switch generation: {e}");177 failed = true;178 }179 }180181 // FIXME: Connection might be disconnected after activation run182183 if action.should_activate() && !failed {184 let _span = info_span!("activating").entered();185 info!("executing activation script");186 let mut switch_script = built.clone();187 switch_script.push("bin");188 switch_script.push("switch-to-configuration");189 let mut cmd = host.cmd(switch_script).in_current_span().await?;190 cmd.arg(action.name().expect("upload.should_activate == false"));191 if let Err(e) = cmd.sudo().run().in_current_span().await {192 error!("failed to activate: {e}");193 failed = true;194 }195 }196 if !disable_rollback {197 if failed {198 info!("executing rollback");199 if let Err(e) = host200 .systemctl_start("rollback-watchdog.service")201 .instrument(info_span!("rollback"))202 .await203 {204 error!("failed to trigger rollback: {e}")205 }206 } else {207 info!("trying to mark upgrade as successful");208 if let Err(e) = host209 .rm_file("/etc/fleet_rollback_marker", true)210 .in_current_span()211 .await212 {213 error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")214 }215 }216 info!("disarming watchdog, just in case");217 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {218 // It is ok, if there was no reboot - then timer might not be running.219 }220 if action.should_schedule_rollback_run() {221 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {222 error!("failed to disarm rollback run: {e}");223 }224 }225 } else if let Err(_e) = host226 .rm_file("/etc/fleet_rollback_marker", true)227 .in_current_span()228 .await229 {230 // Marker might not exist, yet better try to remove it.231 }232 Ok(())233}234235async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {236 info!("building");237 let host = config.host(&host).await?;238 // let action = Action::from(self.subcommand.clone());239 let fleet_config = &config.config_field;240 let drv = nix_go!(241 fleet_config.hosts[{ &host.name }]242 .nixosSystem243 .config244 .system245 .build[{ build_attr }]246 );247 let outputs = drv.build().await.map_err(|e| {248 if build_attr == "sdImage" {249 info!("sd-image build failed");250 info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");251 }252 e253 })?;254 let out_output = outputs255 .get("out")256 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;257258 Ok(out_output.clone())259}260261impl BuildSystems {262 pub async fn run(self, config: &Config) -> Result<()> {263 let hosts = config.list_hosts().await?;264 let set = LocalSet::new();265 let build_attr = self.build_attr.clone();266 for host in hosts.into_iter() {267 if config.should_skip(&host.name) {268 continue;269 }270 let config = config.clone();271 let span = info_span!("build", host = field::display(&host.name));272 let hostname = host.name;273 let build_attr = build_attr.clone();274 // FIXME: Since the introduction of better-nix-eval,275 // due to single repl used for builds, hosts are waiting for each other to build,276 // instead of building concurrently.277 //278 // Open multiple repls?279 //280 // Create build batcher, which will behave similar to golangs281 // WaitGroup, and start executing once all the build tasks are scheduled?282 // This also allows to cleanup build output, as there will be no longer283 // "waiting for remote machine" messages in the cases when one package is needed for284 // multiple hosts.285 set.spawn_local(286 (async move {287 let built = match build_task(config, hostname.clone(), &build_attr).await {288 Ok(path) => path,289 Err(e) => {290 error!("failed to deploy host: {}", e);291 return;292 }293 };294 // TODO: Handle error295 let mut out = current_dir().expect("cwd exists");296 out.push(format!("built-{}", hostname));297298 info!("linking iso image to {:?}", out);299 if let Err(e) = symlink(built, out) {300 error!("failed to symlink: {e}")301 }302 })303 .instrument(span),304 );305 }306 set.await;307 Ok(())308 }309}310311impl Deploy {312 pub async fn run(self, config: &Config) -> Result<()> {313 let hosts = config.list_hosts().await?;314 let set = LocalSet::new();315 for host in hosts.into_iter() {316 if config.should_skip(&host.name) {317 continue;318 }319 let config = config.clone();320 let span = info_span!("deploy", host = field::display(&host.name));321 let hostname = host.name.clone();322 // FIXME: Fix repl concurrency (see build-systems)323 set.spawn_local(324 (async move {325 let built = match build_task(config.clone(), hostname.clone(), "toplevel").await326 {327 Ok(path) => path,328 Err(e) => {329 error!("failed to deploy host: {}", e);330 return;331 }332 };333 if !config.is_local(&hostname) {334 info!("uploading system closure");335 {336 // TODO: Move to remote_derivation method.337 // Alternatively, nix store make-content-addressed can be used,338 // at least for the first deployment, to provide trusted store key.339 //340 // It is much slower, yet doesn't require root on the deployer machine.341 let mut sign = MyCommand::new("nix");342 // Private key for host machine is registered in nix-sign.nix343 sign.arg("store")344 .arg("sign")345 .comparg("--key-file", "/etc/nix/private-key")346 .arg("-r")347 .arg(&built);348 if let Err(e) = sign.sudo().run_nix().await {349 warn!("Failed to sign store paths: {e}");350 };351 }352 let mut tries = 0;353 loop {354 match host.remote_derivation(&built).await {355 Ok(remote) => {356 assert!(remote == built, "CA derivations aren't implemented");357 break;358 }359 Err(e) if tries < 3 => {360 tries += 1;361 warn!("copy failure ({}/3): {}", tries, e);362 sleep(Duration::from_millis(5000)).await;363 }364 Err(e) => {365 error!("upload failed: {e}");366 return;367 }368 }369 }370 }371 if let Err(e) =372 deploy_task(self.action, &host, built, self.disable_rollback).await373 {374 error!("activation failed: {e}");375 }376 })377 .instrument(span),378 );379 }380 set.await;381 Ok(())382 }383}1use std::os::unix::fs::symlink;2use std::path::PathBuf;3use std::{env::current_dir, time::Duration};45use crate::command::MyCommand;6use crate::host::{Config, ConfigHost};7use crate::nix_go;8use anyhow::{anyhow, Result};9use clap::{Parser, ValueEnum};10use itertools::Itertools as _;11use tokio::{task::LocalSet, time::sleep};12use tracing::{error, field, info, info_span, warn, Instrument};1314#[derive(Parser)]15pub struct Deploy {16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,19 /// Action to execute after system is built20 action: DeployAction,21}2223#[derive(ValueEnum, Clone, Copy)]24enum DeployAction {25 /// Upload derivation, but do not execute the update.26 Upload,27 /// Upload and execute the activation script, old version will be used after reboot.28 Test,29 /// Upload and set as current system profile, but do not execute activation script.30 Boot,31 /// Upload, set current profile, and execute activation script.32 Switch,33}3435impl DeployAction {36 pub(crate) fn name(&self) -> Option<&'static str> {37 match self {38 DeployAction::Upload => None,39 DeployAction::Test => Some("test"),40 DeployAction::Boot => Some("boot"),41 DeployAction::Switch => Some("switch"),42 }43 }44 pub(crate) fn should_switch_profile(&self) -> bool {45 matches!(self, Self::Switch | Self::Boot)46 }47 pub(crate) fn should_activate(&self) -> bool {48 matches!(self, Self::Switch | Self::Test)49 }50 pub(crate) fn should_create_rollback_marker(&self) -> bool {51 // Upload does nothing on the target machine, other than uploading the closure.52 // In boot case we want to have rollback marker prepared, so that the system may rollback itself on the next boot.53 !matches!(self, Self::Upload)54 }55 pub(crate) fn should_schedule_rollback_run(&self) -> bool {56 matches!(self, Self::Switch | Self::Test)57 }58}5960#[derive(Parser, Clone)]61pub struct BuildSystems {62 /// Attribute to build. Systems are deployed from "toplevel" attr, well-known used attributes63 /// are "sdImage"/"isoImage", and your configuration may include any other build attributes.64 #[clap(long, default_value = "toplevel")]65 build_attr: String,66}6768struct Generation {69 id: u32,70 current: bool,71 datetime: String,72}73async fn get_current_generation(host: &ConfigHost) -> Result<Generation> {74 let mut cmd = host.cmd("nix-env").await?;75 cmd.comparg("--profile", "/nix/var/nix/profiles/system")76 .arg("--list-generations");77 // Sudo is required due to --list-generations acquiring lock on the profile.78 let data = cmd.sudo().run_string().await?;79 let generations = data80 .split('\n')81 .map(|e| e.trim())82 .filter(|&l| !l.is_empty())83 .filter_map(|g| {84 let gen: Option<Generation> = try {85 let mut parts = g.split_whitespace();86 let id = parts.next()?;87 let id: u32 = id.parse().ok()?;88 let date = parts.next()?;89 let time = parts.next()?;90 let current = if let Some(current) = parts.next() {91 if current == "(current)" {92 Some(true)93 } else {94 None95 }96 } else {97 Some(false)98 };99 let current = current?;100 if parts.next().is_some() {101 warn!("unexpected text after generation: {g}");102 }103 Generation {104 id,105 current,106 datetime: format!("{date} {time}"),107 }108 };109 if gen.is_none() {110 warn!("bad generation: {g}")111 }112 gen113 })114 .collect::<Vec<_>>();115 let current = generations116 .into_iter()117 .filter(|g| g.current)118 .at_most_one()119 .map_err(|_e| anyhow!("bad list-generations output"))?120 .ok_or_else(|| anyhow!("failed to find generation"))?;121 Ok(current)122}123124async fn deploy_task(125 action: DeployAction,126 host: &ConfigHost,127 built: PathBuf,128 disable_rollback: bool,129) -> Result<()> {130 let mut failed = false;131 // TODO: Lockfile, to prevent concurrent system switch?132 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback133 // is scheduler on next boot (default behavior). On current boot - rollback activator will fail due to134 // unit name conflict in systemd-run135 // This code is tied to rollback.nix136 if !disable_rollback && action.should_create_rollback_marker() {137 let _span = info_span!("preparing").entered();138 info!("preparing for rollback");139 let generation = get_current_generation(host).await?;140 info!(141 "rollback target would be {} {}",142 generation.id, generation.datetime143 );144 {145 let mut cmd = host.cmd("sh").await?;146 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));147 if let Err(e) = cmd.sudo().run().await {148 error!("failed to set rollback marker: {e}");149 failed = true;150 }151 }152 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.153 // Kicking it on manually will work best.154 //155 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will156 // only allow one instance of it.157158 // TODO: We should also watch how this process is going.159 // After running this command, we have less than 3 minutes to deploy everything,160 // if we fail to perform generation switch in time, then we will still call the activation script, and this may break something.161 // Anyway, reboot will still help in this case.162 if action.should_schedule_rollback_run() {163 let mut cmd = host.cmd("systemd-run").await?;164 cmd.comparg("--on-active", "3min")165 .comparg("--unit", "rollback-watchdog-run")166 .arg("systemctl")167 .arg("start")168 .arg("rollback-watchdog.service");169 if let Err(e) = cmd.sudo().run().await {170 error!("failed to schedule rollback run: {e}");171 failed = true;172 }173 }174 }175176 if action.should_switch_profile() && !failed {177 info!("switching generation");178 let mut cmd = host.cmd("nix-env").await?;179 cmd.comparg("--profile", "/nix/var/nix/profiles/system")180 .comparg("--set", &built);181 if let Err(e) = cmd.sudo().run().await {182 error!("failed to switch generation: {e}");183 failed = true;184 }185 }186187 // FIXME: Connection might be disconnected after activation run188189 if action.should_activate() && !failed {190 let _span = info_span!("activating").entered();191 info!("executing activation script");192 let mut switch_script = built.clone();193 switch_script.push("bin");194 switch_script.push("switch-to-configuration");195 let mut cmd = host.cmd(switch_script).in_current_span().await?;196 cmd.arg(action.name().expect("upload.should_activate == false"));197 if let Err(e) = cmd.sudo().run().in_current_span().await {198 error!("failed to activate: {e}");199 failed = true;200 }201 }202 if action.should_create_rollback_marker() {203 if !disable_rollback {204 if failed {205 if action.should_schedule_rollback_run() {206 info!("executing rollback");207 if let Err(e) = host208 .systemctl_start("rollback-watchdog.service")209 .instrument(info_span!("rollback"))210 .await211 {212 error!("failed to trigger rollback: {e}")213 }214 }215 } else {216 info!("trying to mark upgrade as successful");217 if let Err(e) = host218 .rm_file("/etc/fleet_rollback_marker", true)219 .in_current_span()220 .await221 {222 error!("failed to remove rollback marker. This is bad, as the system will be rolled back by watchdog: {e}")223 }224 }225 info!("disarming watchdog, just in case");226 if let Err(_e) = host.systemctl_stop("rollback-watchdog.timer").await {227 // It is ok, if there was no reboot - then timer might not be running.228 }229 if action.should_schedule_rollback_run() {230 if let Err(e) = host.systemctl_stop("rollback-watchdog-run.timer").await {231 error!("failed to disarm rollback run: {e}");232 }233 }234 } else if let Err(_e) = host235 .rm_file("/etc/fleet_rollback_marker", true)236 .in_current_span()237 .await238 {239 // Marker might not exist, yet better try to remove it.240 }241 }242 Ok(())243}244245async fn build_task(config: Config, host: String, build_attr: &str) -> Result<PathBuf> {246 info!("building");247 let host = config.host(&host).await?;248 // let action = Action::from(self.subcommand.clone());249 let fleet_config = &config.config_field;250 let drv = nix_go!(251 fleet_config.hosts[{ &host.name }]252 .nixosSystem253 .config254 .system255 .build[{ build_attr }]256 );257 let outputs = drv.build().await.map_err(|e| {258 if build_attr == "sdImage" {259 info!("sd-image build failed");260 info!("Make sure you have imported modulesPath/installer/sd-card/sd-image-<arch>[-installer].nix (For installer, you may want to check config)");261 }262 e263 })?;264 let out_output = outputs265 .get("out")266 .ok_or_else(|| anyhow!("system build should produce \"out\" output"))?;267268 Ok(out_output.clone())269}270271impl BuildSystems {272 pub async fn run(self, config: &Config) -> Result<()> {273 let hosts = config.list_hosts().await?;274 let set = LocalSet::new();275 let build_attr = self.build_attr.clone();276 for host in hosts.into_iter() {277 if config.should_skip(&host.name) {278 continue;279 }280 let config = config.clone();281 let span = info_span!("build", host = field::display(&host.name));282 let hostname = host.name;283 let build_attr = build_attr.clone();284 // FIXME: Since the introduction of better-nix-eval,285 // due to single repl used for builds, hosts are waiting for each other to build,286 // instead of building concurrently.287 //288 // Open multiple repls?289 //290 // Create build batcher, which will behave similar to golangs291 // WaitGroup, and start executing once all the build tasks are scheduled?292 // This also allows to cleanup build output, as there will be no longer293 // "waiting for remote machine" messages in the cases when one package is needed for294 // multiple hosts.295 set.spawn_local(296 (async move {297 let built = match build_task(config, hostname.clone(), &build_attr).await {298 Ok(path) => path,299 Err(e) => {300 error!("failed to deploy host: {}", e);301 return;302 }303 };304 // TODO: Handle error305 let mut out = current_dir().expect("cwd exists");306 out.push(format!("built-{}", hostname));307308 info!("linking iso image to {:?}", out);309 if let Err(e) = symlink(built, out) {310 error!("failed to symlink: {e}")311 }312 })313 .instrument(span),314 );315 }316 set.await;317 Ok(())318 }319}320321impl Deploy {322 pub async fn run(self, config: &Config) -> Result<()> {323 let hosts = config.list_hosts().await?;324 let set = LocalSet::new();325 for host in hosts.into_iter() {326 if config.should_skip(&host.name) {327 continue;328 }329 let config = config.clone();330 let span = info_span!("deploy", host = field::display(&host.name));331 let hostname = host.name.clone();332 // FIXME: Fix repl concurrency (see build-systems)333 set.spawn_local(334 (async move {335 let built = match build_task(config.clone(), hostname.clone(), "toplevel").await336 {337 Ok(path) => path,338 Err(e) => {339 error!("failed to deploy host: {}", e);340 return;341 }342 };343 if !config.is_local(&hostname) {344 info!("uploading system closure");345 {346 // TODO: Move to remote_derivation method.347 // Alternatively, nix store make-content-addressed can be used,348 // at least for the first deployment, to provide trusted store key.349 //350 // It is much slower, yet doesn't require root on the deployer machine.351 let mut sign = MyCommand::new("nix");352 // Private key for host machine is registered in nix-sign.nix353 sign.arg("store")354 .arg("sign")355 .comparg("--key-file", "/etc/nix/private-key")356 .arg("-r")357 .arg(&built);358 if let Err(e) = sign.sudo().run_nix().await {359 warn!("Failed to sign store paths: {e}");360 };361 }362 let mut tries = 0;363 loop {364 match host.remote_derivation(&built).await {365 Ok(remote) => {366 assert!(remote == built, "CA derivations aren't implemented");367 break;368 }369 Err(e) if tries < 3 => {370 tries += 1;371 warn!("copy failure ({}/3): {}", tries, e);372 sleep(Duration::from_millis(5000)).await;373 }374 Err(e) => {375 error!("upload failed: {e}");376 return;377 }378 }379 }380 }381 if let Err(e) =382 deploy_task(self.action, &host, built, self.disable_rollback).await383 {384 error!("activation failed: {e}");385 }386 })387 .instrument(span),388 );389 }390 set.await;391 Ok(())392 }393}