difftreelog
feat automatic rollback
in: trunk
10 files changed
Cargo.lockdiffbeforeafterboth--- a/Cargo.lock
+++ b/Cargo.lock
@@ -610,6 +610,12 @@
]
[[package]]
+name = "either"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
+
+[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -684,6 +690,7 @@
"futures",
"hostname",
"indicatif",
+ "itertools",
"nixlike",
"once_cell",
"peg",
@@ -1127,6 +1134,15 @@
]
[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
+[[package]]
name = "itoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
Cargo.tomldiffbeforeafterboth--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,2 +1,3 @@
[workspace]
members = ["crates/*", "cmds/*"]
+resolver = "2"
cmds/fleet/Cargo.tomldiffbeforeafterboth--- a/cmds/fleet/Cargo.toml
+++ b/cmds/fleet/Cargo.toml
@@ -34,3 +34,4 @@
futures = "0.3.17"
tracing-indicatif = "0.3.5"
indicatif = "0.17.7"
+itertools = "0.11.0"
cmds/fleet/src/cmds/build_systems.rsdiffbeforeafterboth223use crate::command::MyCommand;3use crate::command::MyCommand;4use crate::host::Config;4use crate::host::Config;5use anyhow::Result;5use anyhow::{anyhow, Result};6use clap::Parser;6use clap::Parser;7use itertools::Itertools;7use tokio::{task::LocalSet, time::sleep};8use tokio::{task::LocalSet, time::sleep};8use tracing::{error, field, info, info_span, warn, Instrument};9use tracing::{error, field, info, info_span, warn, Instrument};91012 /// Do not continue on error13 /// Do not continue on error13 #[clap(long)]14 #[clap(long)]14 fail_fast: bool,15 fail_fast: bool,16 /// Disable automatic rollback17 #[clap(long)]18 disable_rollback: bool,15 /// Run builds as sudo19 /// Run builds as sudo16 #[clap(long)]20 #[clap(long)]17 privileged_build: bool,21 privileged_build: bool,39 pub(crate) fn should_activate(&self) -> bool {43 pub(crate) fn should_activate(&self) -> bool {40 matches!(self, Self::Switch | Self::Test)44 matches!(self, Self::Switch | Self::Test)41 }45 }46 pub(crate) fn should_schedule_rollback_run(&self) -> bool {47 matches!(self, Self::Switch | Self::Test)48 }42}49}435044enum PackageAction {51enum PackageAction {103 InstallationCd,110 InstallationCd,104}111}112113struct Generation {114 id: u32,115 current: bool,116 datetime: String,117}118async fn get_current_generation(config: &Config, host: &str) -> Result<Generation> {119 let mut cmd = MyCommand::new("nix-env");120 cmd.comparg("--profile", "/nix/var/nix/profiles/system")121 .arg("--list-generations");122 // Sudo is required due to --list-generations acquiring lock on the profile.123 let data = config.run_string_on(&host, cmd, true).await?;124 let generations = data125 .split('\n')126 .map(|e| e.trim())127 .filter(|&l| l != "")128 .filter_map(|g| {129 let gen: Option<Generation> = try {130 let mut parts = g.split_whitespace();131 let id = parts.next()?;132 let id: u32 = id.parse().ok()?;133 let date = parts.next()?;134 let time = parts.next()?;135 
let current = if let Some(current) = parts.next() {136 if current == "(current)" {137 Some(true)138 } else {139 None140 }141 } else {142 Some(false)143 };144 let current = current?;145 if parts.next().is_some() {146 warn!("unexpected text after generation: {g}");147 }148 Generation {149 id,150 current,151 datetime: format!("{date} {time}"),152 }153 };154 if gen.is_none() {155 warn!("bad generation: {g}")156 }157 gen158 })159 .collect::<Vec<_>>();160 let current = generations161 .into_iter()162 .filter(|g| g.current)163 .at_most_one()164 .map_err(|_e| anyhow!("bad list-generations output"))?165 .ok_or_else(|| anyhow!("failed to find generation"))?;166 Ok(current)167}105168106impl BuildSystems {169impl BuildSystems {107 async fn build_task(self, config: Config, host: String) -> Result<()> {170 async fn build_task(self, config: Config, host: String) -> Result<()> {155 loop {218 loop {156 let mut nix = MyCommand::new("nix");219 let mut nix = MyCommand::new("nix");157 nix.arg("copy")220 nix.arg("copy")221 .arg("--substitute-on-destination")158 .comparg("--to", format!("ssh://root@{host}"))222 .comparg("--to", format!("ssh://root@{host}"))159 .arg(&built);223 .arg(&built);160 match nix.run_nix().await {224 match nix.run_nix().await {169 }233 }170 }234 }171 if let Some(action) = action {235 if let Some(action) = action {236 let mut failed = false;237 // TODO: Lockfile, to prevent concurrent system switch?238 // TODO: If rollback target exists - bail, it should be removed. Lockfile will not work in case if rollback239 // is scheduler on next boot (default behavior). 
On current boot - rollback activator will fail due to240 // unit name conflict in systemd-run241 if !self.disable_rollback {242 let _span = info_span!("preparing").entered();243 info!("preparing for rollback");244 let generation = get_current_generation(&config, &host).await?;245 info!(246 "rollback target would be {} {}",247 generation.id, generation.datetime248 );249 {250 let mut cmd = MyCommand::new("sh");251 cmd.arg("-c").arg(format!("mark=$(mktemp -p /etc -t fleet_rollback_marker.XXXXX) && echo -n {} > $mark && mv --no-clobber $mark /etc/fleet_rollback_marker", generation.id));252 if let Err(e) = config.run_on(&host, cmd, true).await {253 error!("failed to set rollback marker: {e}");254 failed = true;255 }256 }257 // Activation script also starts rollback-watchdog.timer, however, it is possible that it won't be started.258 // Kicking it on manually will work best.259 //260 // There wouldn't be conflict, because here we trigger start of the primary service, and systemd will261 // only allow one instance of it.262 if action.should_schedule_rollback_run() {263 let mut cmd = MyCommand::new("systemd-run");264 cmd.comparg("--on-active", "3min")265 .comparg("--unit", "rollback-watchdog-run")266 .arg("systemctl")267 .arg("start")268 .arg("rollback-watchdog.service");269 if let Err(e) = config.run_on(&host, cmd, true).await {270 error!("failed to schedule rollback run: {e}");271 failed = true;272 }273 }274 }172 if action.should_switch_profile() {275 if action.should_switch_profile() && !failed {173 info!("switching generation");276 info!("switching generation");174 let mut cmd = MyCommand::new("nix-env");277 let mut cmd = MyCommand::new("nix-env");175 cmd.comparg("--profile", "/nix/var/nix/profiles/system")278 cmd.comparg("--profile", "/nix/var/nix/profiles/system")176 .comparg("--set", &built);279 .comparg("--set", &built);177 config.run_on(&host, cmd, true).await?;280 if let Err(e) = config.run_on(&host, cmd, true).await {281 error!("failed to switch generation: 
{e}");282 failed = true;283 }178 }284 }179 if action.should_activate() {285 if action.should_activate() && !failed {286 let _span = info_span!("activating").entered();180 info!("executing activation script");287 info!("executing activation script");181 let mut switch_script = built.clone();288 let mut switch_script = built.clone();182 switch_script.push("bin");289 switch_script.push("bin");183 switch_script.push("switch-to-configuration");290 switch_script.push("switch-to-configuration");184 let mut cmd = MyCommand::new(switch_script);291 let mut cmd = MyCommand::new(switch_script);185 cmd.arg(action.name());292 cmd.arg(action.name());186 config.run_on(&host, cmd, true).await?;293 if let Err(e) = config.run_on(&host, cmd, true).in_current_span().await {294 error!("failed to activate: {e}");295 failed = true;296 }187 }297 }298 if !self.disable_rollback {299 {300 let _span = info_span!("rollback").entered();301 if failed {302 info!("executing rollback");303 let mut cmd = MyCommand::new("systemctl");304 cmd.arg("start").arg("rollback-watchdog.service");305 if let Err(e) = config.run_on(&host, cmd, true).await {306 error!("failed to rollback: {e}");307 }308 } else {309 info!("marking upgrade as successful");310 let mut cmd = MyCommand::new("rm");311 cmd.arg("-f").arg("/etc/fleet_rollback_marker");312 if let Err(e) =313 config.run_on(&host, cmd, true).in_current_span().await314 {315 error!("failed to remove rollback marker. 
This is bad, as the system will be rolled back by watchdog: {e}")316 }317 }318 }319 {320 let _span = info_span!("disarm").entered();321 info!("disarming watchdog, just in case");322 {323 let mut cmd = MyCommand::new("systemctl");324 cmd.arg("stop").arg("rollback-watchdog.timer");325 if let Err(_e) = config.run_on(&host, cmd, true).await {326 // It is ok, if there was no reboot.327 }328 }329 if action.should_schedule_rollback_run() {330 let mut cmd = MyCommand::new("systemctl");331 cmd.arg("stop").arg("rollback-watchdog-run.timer");332 if let Err(e) = config.run_on(&host, cmd, true).await {333 error!("failed to disarm rollback run: {e}");334 }335 }336 }337 }188 }338 }189 }339 }190 Action::Package(PackageAction::SdImage) => {340 Action::Package(PackageAction::SdImage) => {cmds/fleet/src/command.rsdiffbeforeafterboth--- a/cmds/fleet/src/command.rs
+++ b/cmds/fleet/src/command.rs
@@ -143,12 +143,14 @@
pub async fn run_nix_string(self) -> Result<String> {
let str = self.clone().into_string();
- let cmd = self.into_command();
+ let mut cmd = self.into_command();
+ cmd.arg("--log-format").arg("internal-json");
run_nix_inner_stdout(str, cmd, &mut NixHandler::default()).await
}
pub async fn run_nix(self) -> Result<()> {
let str = self.clone().into_string();
let mut cmd = self.into_command();
+ cmd.arg("--log-format").arg("internal-json");
cmd.stdout(Stdio::inherit());
run_nix_inner(str, cmd, &mut NixHandler::default()).await
}
@@ -410,7 +412,6 @@
handler: &mut dyn Handler,
) -> Result<Option<String>> {
info!("running {str}");
- cmd.arg("--log-format").arg("internal-json");
cmd.stderr(Stdio::piped());
cmd.stdout(Stdio::piped());
let mut child = cmd.spawn()?;
cmds/fleet/src/main.rsdiffbeforeafterboth--- a/cmds/fleet/src/main.rs
+++ b/cmds/fleet/src/main.rs
@@ -1,3 +1,5 @@
+#![feature(try_blocks)]
+
pub mod cmds;
pub mod command;
pub mod host;
@@ -6,16 +8,14 @@
mod fleetdata;
use std::ffi::OsString;
-use std::io;
use std::time::Duration;
-use anyhow::{anyhow, bail, Result};
+use anyhow::{bail, Result};
use clap::Parser;
use cmds::{build_systems::BuildSystems, info::Info, secrets::Secrets};
use host::{Config, FleetOpts};
use indicatif::{ProgressState, ProgressStyle};
-use tokio::fs;
use tokio::process::Command;
use tracing::{info, metadata::LevelFilter};
use tracing_indicatif::IndicatifLayer;
@@ -79,9 +79,6 @@
Opts::Prefetch(p) => p.run(config).await?,
};
Ok(())
-}
-fn elapsed_subsec(state: &ProgressState, writer: &mut dyn std::fmt::Write) {
- let _ = writer.write_str(&format!("{:?}", state.elapsed()));
}
#[tokio::main]
cmds/install-secrets/Cargo.tomldiffbeforeafterboth--- a/cmds/install-secrets/Cargo.toml
+++ b/cmds/install-secrets/Cargo.toml
@@ -9,7 +9,7 @@
env_logger = "0.10.0"
log = "0.4.14"
nix = "0.26.1"
-serde = "1.0.130"
+serde = { version = "1.0.130", features = ["derive"] }
serde_json = "1.0.89"
clap = { version = "4.0.29", features = [
"derive",
nixos/modules/module-list.nixdiffbeforeafterboth--- a/nixos/modules/module-list.nix
+++ b/nixos/modules/module-list.nix
@@ -2,4 +2,5 @@
../fleetPkgs.nix
../meta.nix
../secrets.nix
+ ../rollback.nix
]
nixos/rollback.nixdiffbeforeafterboth--- /dev/null
+++ b/nixos/rollback.nix
@@ -0,0 +1,45 @@
+{config, ...}: {
+ # TODO: Make it work with the systemd-initrd approach.
+ # In that case we can't just switch the generation and re-run the activation script, since the root filesystem might
+ # not be mounted yet. We need to explicitly remove the last generation, and this needs deeper integration with
+ # systemd/grub/whatever the user uses. boot.json might also help here.
+
+ systemd.services.rollback-watchdog = {
+ description = "Rollback watchdog";
+ script = ''
+ set -eu
+ if [ -f /etc/fleet_rollback_marker ]; then
+ echo "found the rollback marker, switching to older generation"
+ target=$(cat /etc/fleet_rollback_marker)
+ echo "rolling back profile"
+ nix profile rollback --profile /nix/var/nix/profiles/system --to "$target"
+ echo "executing activation script"
+ "/nix/var/nix/profiles/system-$target-link/bin/switch-to-configuration" switch
+ echo "removing rollback marker"
+ rm -f /etc/fleet_rollback_marker
+ else
+ echo "rollback marker was removed, upgrade is succeeded"
+ fi
+ '';
+ path = [
+ # Should have nix-command support
+ config.nix.package
+ ];
+ serviceConfig.Type = "exec";
+ unitConfig = {
+ X-StopOnRemoval = false;
+ };
+ };
+
+ systemd.timers.rollback-watchdog = {
+ description = "Timer for rollback watchdog";
+ wantedBy = ["timers.target"];
+ timerConfig = {
+ OnUnitActiveSec = "3min";
+ RemainAfterElapse = false;
+ };
+ unitConfig = {
+ ConditionPathExists = "/etc/fleet_rollback_marker";
+ };
+ };
+}
pkgs/fleet-install-secrets.nixdiffbeforeafterboth--- a/pkgs/fleet-install-secrets.nix
+++ b/pkgs/fleet-install-secrets.nix
@@ -6,7 +6,7 @@
name = "${pname}-${version}";
src = ../.;
- cargoBuildFlags = "-p ${pname}";
+ buildAndTestSubdir = "cmds/install-secrets";
cargoLock = {
lockFile = ../Cargo.lock;
outputHashes = {