Skip to main content

monitord/
boot.rs

//! # boot module
//!
//! Collects boot blame metrics showing the slowest units at boot.
//! Similar to `systemd-analyze blame`, but stores only the N slowest units.
5
6use std::array::TryFromSliceError;
7use std::collections::HashMap;
8use std::io::ErrorKind;
9use std::num::TryFromIntError;
10use std::path::{Path, PathBuf};
11use std::sync::Arc;
12
13use anyhow::Result;
14use tokio::sync::RwLock;
15use tracing::debug;
16use zbus::zvariant::ObjectPath;
17
18use crate::config::Config;
19use crate::dbus::zbus_systemd::ManagerProxy;
20use crate::dbus::zbus_unit::UnitProxy;
21use crate::MachineStats;
22
/// Boot blame statistics, keyed by unit name; each value is that unit's
/// activation time in seconds.
pub type BootBlameStats = HashMap<String, f64>;

/// File the current boot id is read from.
const BOOT_ID_PATH: &str = "/proc/sys/kernel/random/boot_id";
/// Directory that holds the on-disk boot blame cache files.
const BOOT_BLAME_CACHE_DIR: &str = "/run/monitord";
/// Suffix appended after `<boot_id>.` to form a cache file name.
const BOOT_BLAME_CACHE_SUFFIX: &str = "boot_blame.bin";
29
30type BootCacheResult<T> = std::result::Result<T, BootCacheError>;
31
32#[derive(Debug, thiserror::Error)]
33enum BootCacheError {
34    #[error("boot cache I/O error: {0}")]
35    Io(#[from] std::io::Error),
36    #[error("boot id from {BOOT_ID_PATH} was empty")]
37    EmptyBootId,
38    #[error("boot cache payload decode error: {0}")]
39    InvalidPayload(&'static str),
40    #[error("boot cache UTF-8 decode error: {0}")]
41    Utf8(#[from] std::string::FromUtf8Error),
42    #[error("boot cache integer conversion error: {0}")]
43    IntConversion(#[from] TryFromIntError),
44    #[error("boot cache slice conversion error: {0}")]
45    SliceConversion(#[from] TryFromSliceError),
46}
47
48fn cache_file_path(cache_dir: &Path, boot_id: &str) -> PathBuf {
49    cache_dir.join(format!("{boot_id}.{BOOT_BLAME_CACHE_SUFFIX}"))
50}
51
52async fn get_boot_id() -> BootCacheResult<String> {
53    let boot_id = tokio::fs::read_to_string(BOOT_ID_PATH).await?;
54    let boot_id = boot_id.trim().to_string();
55    if boot_id.is_empty() {
56        return Err(BootCacheError::EmptyBootId);
57    }
58    Ok(boot_id)
59}
60
61fn encode_boot_blame_stats(stats: &BootBlameStats) -> BootCacheResult<Vec<u8>> {
62    let mut out = Vec::new();
63    let entry_count = u32::try_from(stats.len())?;
64    out.extend_from_slice(&entry_count.to_le_bytes());
65
66    for (unit_name, activation_time) in stats {
67        let unit_name_bytes = unit_name.as_bytes();
68        let unit_name_len = u32::try_from(unit_name_bytes.len())?;
69        out.extend_from_slice(&unit_name_len.to_le_bytes());
70        out.extend_from_slice(unit_name_bytes);
71        out.extend_from_slice(&activation_time.to_le_bytes());
72    }
73
74    Ok(out)
75}
76
77fn decode_boot_blame_stats(content: &[u8]) -> BootCacheResult<BootBlameStats> {
78    const U32_BYTES: usize = std::mem::size_of::<u32>();
79    const F64_BYTES: usize = std::mem::size_of::<f64>();
80    fn read_u32(bytes: &[u8], offset: &mut usize) -> BootCacheResult<u32> {
81        if *offset + std::mem::size_of::<u32>() > bytes.len() {
82            return Err(BootCacheError::InvalidPayload("unexpected end of payload"));
83        }
84        let value =
85            u32::from_le_bytes(bytes[*offset..*offset + std::mem::size_of::<u32>()].try_into()?);
86        *offset += std::mem::size_of::<u32>();
87        Ok(value)
88    }
89
90    if content.len() < U32_BYTES {
91        return Err(BootCacheError::InvalidPayload("payload too small"));
92    }
93
94    let mut offset = 0usize;
95    let entry_count = read_u32(content, &mut offset)? as usize;
96    let mut stats = BootBlameStats::with_capacity(entry_count);
97
98    for _ in 0..entry_count {
99        let name_len = read_u32(content, &mut offset)? as usize;
100        if offset + name_len + F64_BYTES > content.len() {
101            return Err(BootCacheError::InvalidPayload("invalid payload size"));
102        }
103        let unit_name = String::from_utf8(content[offset..offset + name_len].to_vec())?;
104        offset += name_len;
105        let activation_time = f64::from_le_bytes(content[offset..offset + F64_BYTES].try_into()?);
106        offset += F64_BYTES;
107        stats.insert(unit_name, activation_time);
108    }
109
110    if offset != content.len() {
111        return Err(BootCacheError::InvalidPayload("trailing bytes in payload"));
112    }
113
114    Ok(stats)
115}
116
117async fn read_cached_boot_blame_from_dir(
118    cache_dir: &Path,
119    boot_id: &str,
120) -> BootCacheResult<Option<BootBlameStats>> {
121    let cache_path = cache_file_path(cache_dir, boot_id);
122    let content = match tokio::fs::read(&cache_path).await {
123        Ok(content) => content,
124        Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
125        Err(err) => return Err(err.into()),
126    };
127    Ok(Some(decode_boot_blame_stats(&content)?))
128}
129
130async fn write_cached_boot_blame_to_dir(
131    cache_dir: &Path,
132    boot_id: &str,
133    stats: &BootBlameStats,
134) -> BootCacheResult<()> {
135    tokio::fs::create_dir_all(cache_dir).await?;
136    let cache_path = cache_file_path(cache_dir, boot_id);
137    let encoded = encode_boot_blame_stats(stats)?;
138    tokio::fs::write(cache_path, encoded).await?;
139    Ok(())
140}
141
142async fn read_cached_boot_blame(boot_id: &str) -> BootCacheResult<Option<BootBlameStats>> {
143    read_cached_boot_blame_from_dir(Path::new(BOOT_BLAME_CACHE_DIR), boot_id).await
144}
145
146async fn write_cached_boot_blame(boot_id: &str, stats: &BootBlameStats) -> BootCacheResult<()> {
147    write_cached_boot_blame_to_dir(Path::new(BOOT_BLAME_CACHE_DIR), boot_id, stats).await
148}
149
150/// Calculate the activation time for a unit
151/// Returns the time in seconds from InactiveExitTimestamp to ActiveEnterTimestamp
152async fn get_unit_activation_time(
153    connection: &zbus::Connection,
154    unit_path: &ObjectPath<'_>,
155) -> Result<f64> {
156    let unit_proxy = UnitProxy::builder(connection)
157        .cache_properties(zbus::proxy::CacheProperties::No)
158        .path(unit_path)?
159        .build()
160        .await?;
161
162    let inactive_exit = unit_proxy.inactive_exit_timestamp().await?;
163    let active_enter = unit_proxy.active_enter_timestamp().await?;
164
165    // If either timestamp is 0, the unit hasn't been activated or the timing is invalid
166    if inactive_exit == 0 || active_enter == 0 {
167        return Ok(0.0);
168    }
169
170    // Calculate activation time in seconds (timestamps are in microseconds)
171    let activation_time_usec = active_enter.saturating_sub(inactive_exit);
172    let activation_time_sec = activation_time_usec as f64 / 1_000_000.0;
173
174    Ok(activation_time_sec)
175}
176
177/// Update boot blame statistics with the N slowest units at boot
178pub async fn update_boot_blame_stats(
179    config: Arc<Config>,
180    connection: zbus::Connection,
181    machine_stats: Arc<RwLock<MachineStats>>,
182) -> Result<()> {
183    debug!("Starting boot blame stats collection");
184
185    let mut maybe_boot_id = None;
186    if config.boot_blame.cache_enabled {
187        let cached_stats = machine_stats.read().await.boot_blame.clone();
188        if cached_stats.is_some() {
189            debug!("Using in-memory cached boot blame stats");
190            return Ok(());
191        }
192
193        match get_boot_id().await {
194            Ok(boot_id) => {
195                match read_cached_boot_blame(&boot_id).await {
196                    Ok(Some(cached_boot_blame)) => {
197                        let cache_path = cache_file_path(Path::new(BOOT_BLAME_CACHE_DIR), &boot_id);
198                        debug!(
199                            "Using cached boot blame stats from {}",
200                            cache_path.display()
201                        );
202                        machine_stats.write().await.boot_blame = Some(cached_boot_blame);
203                        return Ok(());
204                    }
205                    Ok(None) => {
206                        debug!("No cached boot blame stats found for boot id {}", boot_id);
207                    }
208                    Err(err) => {
209                        debug!(
210                            "Failed to load boot blame cache for boot id {}: {}",
211                            boot_id, err
212                        );
213                    }
214                }
215                maybe_boot_id = Some(boot_id);
216            }
217            Err(err) => {
218                debug!("Failed to retrieve boot id for boot blame cache: {}", err);
219            }
220        }
221    }
222
223    let systemd_proxy = ManagerProxy::builder(&connection)
224        .cache_properties(zbus::proxy::CacheProperties::No)
225        .build()
226        .await?;
227    let units = systemd_proxy.list_units().await?;
228
229    let mut unit_times: Vec<(String, f64)> = Vec::new();
230
231    // Collect activation times for all units
232    for unit_info in units {
233        let unit_name = unit_info.0;
234        let unit_path = unit_info.6;
235
236        // Apply blocklist: skip units explicitly excluded
237        if config.boot_blame.blocklist.contains(&unit_name) {
238            debug!("Skipping boot blame for {} due to blocklist", &unit_name);
239            continue;
240        }
241        // Apply allowlist: if non-empty, only include listed units
242        if !config.boot_blame.allowlist.is_empty()
243            && !config.boot_blame.allowlist.contains(&unit_name)
244        {
245            continue;
246        }
247
248        match get_unit_activation_time(&connection, &unit_path).await {
249            Ok(time) if time > 0.0 => {
250                unit_times.push((unit_name, time));
251            }
252            Ok(_) => {
253                // Unit has no activation time (0.0), skip it
254            }
255            Err(e) => {
256                debug!("Failed to get activation time for {}: {}", unit_name, e);
257            }
258        }
259    }
260
261    // Sort by activation time in descending order (slowest first)
262    unit_times.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
263
264    // Take only the N slowest units
265    let num_slowest = config.boot_blame.num_slowest_units as usize;
266    unit_times.truncate(num_slowest);
267
268    // Convert to HashMap
269    let boot_blame_stats: BootBlameStats = unit_times.into_iter().collect();
270
271    debug!("Collected {} boot blame stats", boot_blame_stats.len());
272
273    // Update machine stats
274    let mut stats = machine_stats.write().await;
275    stats.boot_blame = Some(boot_blame_stats);
276    if config.boot_blame.cache_enabled {
277        if let Some(boot_id) = maybe_boot_id {
278            if let Some(cached_stats) = stats.boot_blame.as_ref() {
279                if let Err(err) = write_cached_boot_blame(&boot_id, cached_stats).await {
280                    debug!(
281                        "Failed to write boot blame cache for boot id {} to {}: {}",
282                        boot_id, BOOT_BLAME_CACHE_DIR, err
283                    );
284                } else {
285                    debug!("Updated boot blame cache for boot id {}", boot_id);
286                }
287            }
288        }
289    }
290
291    Ok(())
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a stats map from `(unit name, seconds)` pairs.
    fn stats_from(entries: &[(&str, f64)]) -> BootBlameStats {
        entries
            .iter()
            .map(|&(unit, seconds)| (unit.to_string(), seconds))
            .collect()
    }

    // Encoding followed by decoding must give back the same map.
    #[test]
    fn test_boot_blame_cache_encode_decode_roundtrip() {
        let original = stats_from(&[("foo.service", 12.3), ("bar.service", 45.6)]);

        let bytes = encode_boot_blame_stats(&original).expect("encode should succeed");
        let restored = decode_boot_blame_stats(&bytes).expect("decode should succeed");

        assert_eq!(original, restored);
    }

    // A payload shorter than the entry-count header is rejected.
    #[test]
    fn test_boot_blame_cache_decode_invalid_payload() {
        let too_short = vec![0, 1, 2];
        assert!(decode_boot_blame_stats(&too_short).is_err());
    }

    // Stats written to a directory can be read back for the same boot id.
    #[tokio::test]
    async fn test_boot_blame_cache_read_write_roundtrip() {
        let cache_dir = tempfile::tempdir().expect("create temp dir");
        let boot_id = "00000000-0000-0000-0000-000000000001";
        let written = stats_from(&[("foo.service", 1.25)]);

        write_cached_boot_blame_to_dir(cache_dir.path(), boot_id, &written)
            .await
            .expect("write cache");

        let loaded = read_cached_boot_blame_from_dir(cache_dir.path(), boot_id)
            .await
            .expect("read cache");
        assert_eq!(Some(written), loaded);
    }

    // A missing cache file is reported as `None`, not as an error.
    #[tokio::test]
    async fn test_boot_blame_cache_read_missing_file() {
        let cache_dir = tempfile::tempdir().expect("create temp dir");
        let boot_id = "00000000-0000-0000-0000-000000000002";

        let loaded = read_cached_boot_blame_from_dir(cache_dir.path(), boot_id)
            .await
            .expect("missing cache should not error");
        assert!(loaded.is_none());
    }
}