1use std::array::TryFromSliceError;
7use std::collections::HashMap;
8use std::io::ErrorKind;
9use std::num::TryFromIntError;
10use std::path::{Path, PathBuf};
11use std::sync::Arc;
12
13use anyhow::Result;
14use tokio::sync::RwLock;
15use tracing::debug;
16use zbus::zvariant::ObjectPath;
17
18use crate::config::Config;
19use crate::dbus::zbus_systemd::ManagerProxy;
20use crate::dbus::zbus_unit::UnitProxy;
21use crate::MachineStats;
22
/// Boot blame results: unit name mapped to its activation time in seconds.
pub type BootBlameStats = HashMap<String, f64>;

/// File read by `get_boot_id` to obtain the identifier of the current boot.
const BOOT_ID_PATH: &str = "/proc/sys/kernel/random/boot_id";
/// Directory that holds the on-disk boot blame cache files.
const BOOT_BLAME_CACHE_DIR: &str = "/run/monitord";
/// Trailing part of every cache file name; `cache_file_path` prefixes it with `<boot_id>.`.
const BOOT_BLAME_CACHE_SUFFIX: &str = "boot_blame.bin";

/// Result alias used by the boot blame cache helpers, which fail with `BootCacheError`.
type BootCacheResult<T> = std::result::Result<T, BootCacheError>;
31
/// Failures that can occur while reading, writing, encoding or decoding the
/// on-disk boot blame cache.
#[derive(Debug, thiserror::Error)]
enum BootCacheError {
    /// An underlying filesystem operation failed.
    #[error("boot cache I/O error: {0}")]
    Io(#[from] std::io::Error),
    /// The boot id file was readable but contained only whitespace.
    #[error("boot id from {BOOT_ID_PATH} was empty")]
    EmptyBootId,
    /// The cache payload is structurally invalid; the string names the problem.
    #[error("boot cache payload decode error: {0}")]
    InvalidPayload(&'static str),
    /// A unit name stored in the payload was not valid UTF-8.
    #[error("boot cache UTF-8 decode error: {0}")]
    Utf8(#[from] std::string::FromUtf8Error),
    /// A length did not fit into the integer type used by the payload format.
    #[error("boot cache integer conversion error: {0}")]
    IntConversion(#[from] TryFromIntError),
    /// A byte slice could not be converted into a fixed-size array.
    #[error("boot cache slice conversion error: {0}")]
    SliceConversion(#[from] TryFromSliceError),
}
47
48fn cache_file_path(cache_dir: &Path, boot_id: &str) -> PathBuf {
49 cache_dir.join(format!("{boot_id}.{BOOT_BLAME_CACHE_SUFFIX}"))
50}
51
52async fn get_boot_id() -> BootCacheResult<String> {
53 let boot_id = tokio::fs::read_to_string(BOOT_ID_PATH).await?;
54 let boot_id = boot_id.trim().to_string();
55 if boot_id.is_empty() {
56 return Err(BootCacheError::EmptyBootId);
57 }
58 Ok(boot_id)
59}
60
61fn encode_boot_blame_stats(stats: &BootBlameStats) -> BootCacheResult<Vec<u8>> {
62 let mut out = Vec::new();
63 let entry_count = u32::try_from(stats.len())?;
64 out.extend_from_slice(&entry_count.to_le_bytes());
65
66 for (unit_name, activation_time) in stats {
67 let unit_name_bytes = unit_name.as_bytes();
68 let unit_name_len = u32::try_from(unit_name_bytes.len())?;
69 out.extend_from_slice(&unit_name_len.to_le_bytes());
70 out.extend_from_slice(unit_name_bytes);
71 out.extend_from_slice(&activation_time.to_le_bytes());
72 }
73
74 Ok(out)
75}
76
77fn decode_boot_blame_stats(content: &[u8]) -> BootCacheResult<BootBlameStats> {
78 const U32_BYTES: usize = std::mem::size_of::<u32>();
79 const F64_BYTES: usize = std::mem::size_of::<f64>();
80 fn read_u32(bytes: &[u8], offset: &mut usize) -> BootCacheResult<u32> {
81 if *offset + std::mem::size_of::<u32>() > bytes.len() {
82 return Err(BootCacheError::InvalidPayload("unexpected end of payload"));
83 }
84 let value =
85 u32::from_le_bytes(bytes[*offset..*offset + std::mem::size_of::<u32>()].try_into()?);
86 *offset += std::mem::size_of::<u32>();
87 Ok(value)
88 }
89
90 if content.len() < U32_BYTES {
91 return Err(BootCacheError::InvalidPayload("payload too small"));
92 }
93
94 let mut offset = 0usize;
95 let entry_count = read_u32(content, &mut offset)? as usize;
96 let mut stats = BootBlameStats::with_capacity(entry_count);
97
98 for _ in 0..entry_count {
99 let name_len = read_u32(content, &mut offset)? as usize;
100 if offset + name_len + F64_BYTES > content.len() {
101 return Err(BootCacheError::InvalidPayload("invalid payload size"));
102 }
103 let unit_name = String::from_utf8(content[offset..offset + name_len].to_vec())?;
104 offset += name_len;
105 let activation_time = f64::from_le_bytes(content[offset..offset + F64_BYTES].try_into()?);
106 offset += F64_BYTES;
107 stats.insert(unit_name, activation_time);
108 }
109
110 if offset != content.len() {
111 return Err(BootCacheError::InvalidPayload("trailing bytes in payload"));
112 }
113
114 Ok(stats)
115}
116
117async fn read_cached_boot_blame_from_dir(
118 cache_dir: &Path,
119 boot_id: &str,
120) -> BootCacheResult<Option<BootBlameStats>> {
121 let cache_path = cache_file_path(cache_dir, boot_id);
122 let content = match tokio::fs::read(&cache_path).await {
123 Ok(content) => content,
124 Err(err) if err.kind() == ErrorKind::NotFound => return Ok(None),
125 Err(err) => return Err(err.into()),
126 };
127 Ok(Some(decode_boot_blame_stats(&content)?))
128}
129
130async fn write_cached_boot_blame_to_dir(
131 cache_dir: &Path,
132 boot_id: &str,
133 stats: &BootBlameStats,
134) -> BootCacheResult<()> {
135 tokio::fs::create_dir_all(cache_dir).await?;
136 let cache_path = cache_file_path(cache_dir, boot_id);
137 let encoded = encode_boot_blame_stats(stats)?;
138 tokio::fs::write(cache_path, encoded).await?;
139 Ok(())
140}
141
142async fn read_cached_boot_blame(boot_id: &str) -> BootCacheResult<Option<BootBlameStats>> {
143 read_cached_boot_blame_from_dir(Path::new(BOOT_BLAME_CACHE_DIR), boot_id).await
144}
145
146async fn write_cached_boot_blame(boot_id: &str, stats: &BootBlameStats) -> BootCacheResult<()> {
147 write_cached_boot_blame_to_dir(Path::new(BOOT_BLAME_CACHE_DIR), boot_id, stats).await
148}
149
150async fn get_unit_activation_time(
153 connection: &zbus::Connection,
154 unit_path: &ObjectPath<'_>,
155) -> Result<f64> {
156 let unit_proxy = UnitProxy::builder(connection)
157 .cache_properties(zbus::proxy::CacheProperties::No)
158 .path(unit_path)?
159 .build()
160 .await?;
161
162 let inactive_exit = unit_proxy.inactive_exit_timestamp().await?;
163 let active_enter = unit_proxy.active_enter_timestamp().await?;
164
165 if inactive_exit == 0 || active_enter == 0 {
167 return Ok(0.0);
168 }
169
170 let activation_time_usec = active_enter.saturating_sub(inactive_exit);
172 let activation_time_sec = activation_time_usec as f64 / 1_000_000.0;
173
174 Ok(activation_time_sec)
175}
176
177pub async fn update_boot_blame_stats(
179 config: Arc<Config>,
180 connection: zbus::Connection,
181 machine_stats: Arc<RwLock<MachineStats>>,
182) -> Result<()> {
183 debug!("Starting boot blame stats collection");
184
185 let mut maybe_boot_id = None;
186 if config.boot_blame.cache_enabled {
187 let cached_stats = machine_stats.read().await.boot_blame.clone();
188 if cached_stats.is_some() {
189 debug!("Using in-memory cached boot blame stats");
190 return Ok(());
191 }
192
193 match get_boot_id().await {
194 Ok(boot_id) => {
195 match read_cached_boot_blame(&boot_id).await {
196 Ok(Some(cached_boot_blame)) => {
197 let cache_path = cache_file_path(Path::new(BOOT_BLAME_CACHE_DIR), &boot_id);
198 debug!(
199 "Using cached boot blame stats from {}",
200 cache_path.display()
201 );
202 machine_stats.write().await.boot_blame = Some(cached_boot_blame);
203 return Ok(());
204 }
205 Ok(None) => {
206 debug!("No cached boot blame stats found for boot id {}", boot_id);
207 }
208 Err(err) => {
209 debug!(
210 "Failed to load boot blame cache for boot id {}: {}",
211 boot_id, err
212 );
213 }
214 }
215 maybe_boot_id = Some(boot_id);
216 }
217 Err(err) => {
218 debug!("Failed to retrieve boot id for boot blame cache: {}", err);
219 }
220 }
221 }
222
223 let systemd_proxy = ManagerProxy::builder(&connection)
224 .cache_properties(zbus::proxy::CacheProperties::No)
225 .build()
226 .await?;
227 let units = systemd_proxy.list_units().await?;
228
229 let mut unit_times: Vec<(String, f64)> = Vec::new();
230
231 for unit_info in units {
233 let unit_name = unit_info.0;
234 let unit_path = unit_info.6;
235
236 if config.boot_blame.blocklist.contains(&unit_name) {
238 debug!("Skipping boot blame for {} due to blocklist", &unit_name);
239 continue;
240 }
241 if !config.boot_blame.allowlist.is_empty()
243 && !config.boot_blame.allowlist.contains(&unit_name)
244 {
245 continue;
246 }
247
248 match get_unit_activation_time(&connection, &unit_path).await {
249 Ok(time) if time > 0.0 => {
250 unit_times.push((unit_name, time));
251 }
252 Ok(_) => {
253 }
255 Err(e) => {
256 debug!("Failed to get activation time for {}: {}", unit_name, e);
257 }
258 }
259 }
260
261 unit_times.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
263
264 let num_slowest = config.boot_blame.num_slowest_units as usize;
266 unit_times.truncate(num_slowest);
267
268 let boot_blame_stats: BootBlameStats = unit_times.into_iter().collect();
270
271 debug!("Collected {} boot blame stats", boot_blame_stats.len());
272
273 let mut stats = machine_stats.write().await;
275 stats.boot_blame = Some(boot_blame_stats);
276 if config.boot_blame.cache_enabled {
277 if let Some(boot_id) = maybe_boot_id {
278 if let Some(cached_stats) = stats.boot_blame.as_ref() {
279 if let Err(err) = write_cached_boot_blame(&boot_id, cached_stats).await {
280 debug!(
281 "Failed to write boot blame cache for boot id {} to {}: {}",
282 boot_id, BOOT_BLAME_CACHE_DIR, err
283 );
284 } else {
285 debug!("Updated boot blame cache for boot id {}", boot_id);
286 }
287 }
288 }
289 }
290
291 Ok(())
292}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Encoding a map and decoding the resulting bytes yields an equal map.
    #[test]
    fn test_boot_blame_cache_encode_decode_roundtrip() {
        let original = BootBlameStats::from([
            ("foo.service".to_string(), 12.3),
            ("bar.service".to_string(), 45.6),
        ]);

        let bytes = encode_boot_blame_stats(&original).expect("encode should succeed");
        let roundtripped = decode_boot_blame_stats(&bytes).expect("decode should succeed");

        assert_eq!(original, roundtripped);
    }

    /// A payload shorter than the entry-count header is rejected.
    #[test]
    fn test_boot_blame_cache_decode_invalid_payload() {
        let too_short: [u8; 3] = [0, 1, 2];
        let outcome = decode_boot_blame_stats(&too_short);
        assert!(outcome.is_err());
    }

    /// Stats written to a directory can be read back for the same boot id.
    #[tokio::test]
    async fn test_boot_blame_cache_read_write_roundtrip() {
        let scratch = tempfile::tempdir().expect("create temp dir");
        let boot_id = "00000000-0000-0000-0000-000000000001";
        let written = BootBlameStats::from([("foo.service".to_string(), 1.25)]);

        write_cached_boot_blame_to_dir(scratch.path(), boot_id, &written)
            .await
            .expect("write cache");

        let loaded = read_cached_boot_blame_from_dir(scratch.path(), boot_id)
            .await
            .expect("read cache");
        assert_eq!(Some(written), loaded);
    }

    /// Reading a boot id that was never cached yields `None`, not an error.
    #[tokio::test]
    async fn test_boot_blame_cache_read_missing_file() {
        let scratch = tempfile::tempdir().expect("create temp dir");
        let unknown_boot_id = "00000000-0000-0000-0000-000000000002";

        let loaded = read_cached_boot_blame_from_dir(scratch.path(), unknown_boot_id)
            .await
            .expect("missing cache should not error");

        assert!(loaded.is_none());
    }
}