From 741106e60111f921f02a18c520f240047564017c Mon Sep 17 00:00:00 2001 From: Yaroslav Bolyukin Date: Sun, 15 Oct 2023 00:35:00 +0000 Subject: [PATCH] feat: automatic rollback --- --- a/Cargo.lock +++ b/Cargo.lock @@ -610,6 +610,12 @@ ] [[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] name = "encode_unicode" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -684,6 +690,7 @@ "futures", "hostname", "indicatif", + "itertools", "nixlike", "once_cell", "peg", @@ -1127,6 +1134,15 @@ ] [[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" --- a/Cargo.toml +++ b/Cargo.toml @@ -1,2 +1,3 @@ [workspace] members = ["crates/*", "cmds/*"] +resolver = "2" --- a/cmds/fleet/Cargo.toml +++ b/cmds/fleet/Cargo.toml @@ -34,3 +34,4 @@ futures = "0.3.17" tracing-indicatif = "0.3.5" indicatif = "0.17.7" +itertools = "0.11.0" --- a/cmds/fleet/src/cmds/build_systems.rs +++ b/cmds/fleet/src/cmds/build_systems.rs @@ -2,8 +2,9 @@ use crate::command::MyCommand; use crate::host::Config; -use anyhow::Result; +use anyhow::{anyhow, Result}; use clap::Parser; +use itertools::Itertools; use tokio::{task::LocalSet, time::sleep}; use tracing::{error, field, info, info_span, warn, Instrument}; @@ -12,6 +13,9 @@ /// Do not continue on error #[clap(long)] fail_fast: bool, + /// Disable automatic rollback + #[clap(long)] + disable_rollback: bool, /// Run builds as sudo #[clap(long)] privileged_build: bool, @@ -39,6 +43,9 @@ pub(crate) fn should_activate(&self) -> bool { matches!(self, Self::Switch | Self::Test) } + pub(crate) fn 
should_schedule_rollback_run(&self) -> bool { + matches!(self, Self::Switch | Self::Test) + } } enum PackageAction { @@ -103,6 +110,62 @@ InstallationCd, } +struct Generation { + id: u32, + current: bool, + datetime: String, +} +async fn get_current_generation(config: &Config, host: &str) -> Result<Generation> { + let mut cmd = MyCommand::new("nix-env"); + cmd.comparg("--profile", "/nix/var/nix/profiles/system") + .arg("--list-generations"); + // Sudo is required due to --list-generations acquiring lock on the profile. + let data = config.run_string_on(&host, cmd, true).await?; + let generations = data + .split('\n') + .map(|e| e.trim()) + .filter(|&l| l != "") + .filter_map(|g| { + let gen: Option<Generation> = try { + let mut parts = g.split_whitespace(); + let id = parts.next()?; + let id: u32 = id.parse().ok()?; + let date = parts.next()?; + let time = parts.next()?; + let current = if let Some(current) = parts.next() { + if current == "(current)" { + Some(true) + } else { + None + } + } else { + Some(false) + }; + let current = current?; + if parts.next().is_some() { + warn!("unexpected text after generation: {g}"); + } + Generation { + id, + current, + datetime: format!("{date} {time}"), + } + }; + if gen.is_none() { + warn!("bad generation: {g}") + } + gen + }) + .collect::<Vec<_>>(); + let current = generations + .into_iter() + .filter(|g| g.current) + .at_most_one() + .map_err(|_e| anyhow!("bad list-generations output"))? + .ok_or_else(|| anyhow!("failed to find generation"))?; + Ok(current) +} + impl BuildSystems { async fn build_task(self, config: Config, host: String) -> Result<()> { info!("building"); @@ -155,6 +218,7 @@ loop { let mut nix = MyCommand::new("nix"); nix.arg("copy") + .arg("--substitute-on-destination") .comparg("--to", format!("ssh://root@{host}")) .arg(&built); match nix.run_nix().await { @@ -169,21 +233,107 @@ } } if let Some(action) = action { - if action.should_switch_profile() { + let mut failed = false; + // TODO: Lockfile, to prevent concurrent system switch? 
+ // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work if rollback + // is scheduled on next boot (default behavior). On current boot - rollback activator will fail due to + // unit name conflict in systemd-run + if !self.disable_rollback { + let _span = info_span!("preparing").entered(); + info!("preparing for rollback"); + let generation = get_current_generation(&config, &host).await?; + info!( + "rollback target would be {} {}", + generation.id, generation.datetime + ); + { + let mut cmd = MyCommand::new("sh"); + cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id)); + if let Err(e) = config.run_on(&host, cmd, true).await { + error!("failed to set rollback marker: {e}"); + failed = true; + } + } + // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started. + // Kicking it off manually will work best. + // + // There won't be a conflict, because here we trigger start of the primary service, and systemd will + // only allow one instance of it. 
+ if action.should_schedule_rollback_run() { + let mut cmd = MyCommand::new("systemd-run"); + cmd.comparg("--on-active", "3min") + .comparg("--unit", "rollback-watchdog-run") + .arg("systemctl") + .arg("start") + .arg("rollback-watchdog.service"); + if let Err(e) = config.run_on(&host, cmd, true).await { + error!("failed to schedule rollback run: {e}"); + failed = true; + } + } + } + if action.should_switch_profile() && !failed { info!("switching generation"); let mut cmd = MyCommand::new("nix-env"); cmd.comparg("--profile", "/nix/var/nix/profiles/system") .comparg("--set", &built); - config.run_on(&host, cmd, true).await?; + if let Err(e) = config.run_on(&host, cmd, true).await { + error!("failed to switch generation: {e}"); + failed = true; + } } - if action.should_activate() { + if action.should_activate() && !failed { + let _span = info_span!("activating").entered(); info!("executing activation script"); let mut switch_script = built.clone(); switch_script.push("bin"); switch_script.push("switch-to-configuration"); let mut cmd = MyCommand::new(switch_script); cmd.arg(action.name()); - config.run_on(&host, cmd, true).await?; + if let Err(e) = config.run_on(&host, cmd, true).in_current_span().await { + error!("failed to activate: {e}"); + failed = true; + } + } + if !self.disable_rollback { + { + let _span = info_span!("rollback").entered(); + if failed { + info!("executing rollback"); + let mut cmd = MyCommand::new("systemctl"); + cmd.arg("start").arg("rollback-watchdog.service"); + if let Err(e) = config.run_on(&host, cmd, true).await { + error!("failed to rollback: {e}"); + } + } else { + info!("marking upgrade as successful"); + let mut cmd = MyCommand::new("rm"); + cmd.arg("-f").arg("/etc/fleet_rollback_marker"); + if let Err(e) = + config.run_on(&host, cmd, true).in_current_span().await + { + error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}") + } + } + } + { + let _span = info_span!("disarm").entered(); + info!("disarming watchdog, just in case"); + { + let mut cmd = MyCommand::new("systemctl"); + cmd.arg("stop").arg("rollback-watchdog.timer"); + if let Err(_e) = config.run_on(&host, cmd, true).await { + // It is ok, if there was no reboot. + } + } + if action.should_schedule_rollback_run() { + let mut cmd = MyCommand::new("systemctl"); + cmd.arg("stop").arg("rollback-watchdog-run.timer"); + if let Err(e) = config.run_on(&host, cmd, true).await { + error!("failed to disarm rollback run: {e}"); + } + } + } } } } --- a/cmds/fleet/src/command.rs +++ b/cmds/fleet/src/command.rs @@ -143,12 +143,14 @@ pub async fn run_nix_string(self) -> Result { let str = self.clone().into_string(); - let cmd = self.into_command(); + let mut cmd = self.into_command(); + cmd.arg("--log-format").arg("internal-json"); run_nix_inner_stdout(str, cmd, &mut NixHandler::default()).await } pub async fn run_nix(self) -> Result<()> { let str = self.clone().into_string(); let mut cmd = self.into_command(); + cmd.arg("--log-format").arg("internal-json"); cmd.stdout(Stdio::inherit()); run_nix_inner(str, cmd, &mut NixHandler::default()).await } @@ -410,7 +412,6 @@ handler: &mut dyn Handler, ) -> Result> { info!("running {str}"); - cmd.arg("--log-format").arg("internal-json"); cmd.stderr(Stdio::piped()); cmd.stdout(Stdio::piped()); let mut child = cmd.spawn()?; --- a/cmds/fleet/src/main.rs +++ b/cmds/fleet/src/main.rs @@ -1,3 +1,5 @@ +#![feature(try_blocks)] + pub mod cmds; pub mod command; pub mod host; @@ -6,16 +8,14 @@ mod fleetdata; use std::ffi::OsString; -use std::io; use std::time::Duration; -use anyhow::{anyhow, bail, Result}; +use anyhow::{bail, Result}; use clap::Parser; use cmds::{build_systems::BuildSystems, info::Info, secrets::Secrets}; use host::{Config, FleetOpts}; use indicatif::{ProgressState, ProgressStyle}; -use tokio::fs; use tokio::process::Command; use 
tracing::{info, metadata::LevelFilter}; use tracing_indicatif::IndicatifLayer; @@ -79,9 +79,6 @@ Opts::Prefetch(p) => p.run(config).await?, }; Ok(()) -} -fn elapsed_subsec(state: &ProgressState, writer: &mut dyn std::fmt::Write) { - let _ = writer.write_str(&format!("{:?}", state.elapsed())); } #[tokio::main] --- a/cmds/install-secrets/Cargo.toml +++ b/cmds/install-secrets/Cargo.toml @@ -9,7 +9,7 @@ env_logger = "0.10.0" log = "0.4.14" nix = "0.26.1" -serde = "1.0.130" +serde = { version = "1.0.130", features = ["derive"] } serde_json = "1.0.89" clap = { version = "4.0.29", features = [ "derive", --- a/nixos/modules/module-list.nix +++ b/nixos/modules/module-list.nix @@ -2,4 +2,5 @@ ../fleetPkgs.nix ../meta.nix ../secrets.nix + ../rollback.nix ] --- /dev/null +++ b/nixos/rollback.nix @@ -0,0 +1,45 @@ +{config, ...}: { + # TODO: Make it work with systemd-initrd approach. + # In this case we can't just switch generation and re-run activation script, since the root filesystem might not be + # mounted yet. We need to explicitly remove the last generation, and this needs deeper integration with systemd/grub/ + # whatever user uses. boot.json also might help here. 
+ + systemd.services.rollback-watchdog = { + description = "Rollback watchdog"; + script = '' + set -eu + if [ -f /etc/fleet_rollback_marker ]; then + echo "found the rollback marker, switching to older generation" + target=$(cat /etc/fleet_rollback_marker) + echo "rolling back profile" + nix profile rollback --profile /nix/var/nix/profiles/system --to "$target" + echo "executing activation script" + "/nix/var/nix/profiles/system-$target-link/bin/switch-to-configuration" switch + echo "removing rollback marker" + rm -f /etc/fleet_rollback_marker + else + echo "rollback marker was removed, upgrade is succeeded" + fi + ''; + path = [ + # Should have nix-command support + config.nix.package + ]; + serviceConfig.Type = "exec"; + unitConfig = { + X-StopOnRemoval = false; + }; + }; + + systemd.timers.rollback-watchdog = { + description = "Timer for rollback watchdog"; + wantedBy = ["timers.target"]; + timerConfig = { + OnUnitActiveSec = "3min"; + RemainAfterElapse = false; + }; + unitConfig = { + ConditionPathExists = "/etc/fleet_rollback_marker"; + }; + }; +} --- a/pkgs/fleet-install-secrets.nix +++ b/pkgs/fleet-install-secrets.nix @@ -6,7 +6,7 @@ name = "${pname}-${version}"; src = ../.; - cargoBuildFlags = "-p ${pname}"; + buildAndTestSubdir = "cmds/install-secrets"; cargoLock = { lockFile = ../Cargo.lock; outputHashes = { -- gitstuff