monitord/
verify.rs

//! # verify module
//!
//! Collects systemd unit verification errors by running `systemd-analyze verify`
//! on the units systemd reports over D-Bus (after allowlist/blocklist filtering)
//! and parsing the command's stderr. Tracks counts of failing units by type.
5
6use std::collections::{HashMap, HashSet};
7use std::process::Command;
8use std::sync::Arc;
9
10use thiserror::Error;
11use tokio::sync::RwLock;
12
13use crate::MachineStats;
14
15#[derive(Error, Debug)]
16pub enum MonitordVerifyError {
17    #[error("Failed to execute systemd-analyze: {0}")]
18    CommandError(String),
19    #[error("Unable to connect to D-Bus via zbus: {0:#}")]
20    ZbusError(#[from] zbus::Error),
21}
22
23/// Statistics about unit verification errors, aggregated by unit type (service, slice, timer, etc.)
24#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, Eq, PartialEq)]
25pub struct VerifyStats {
26    /// Total count of units with verification failures
27    pub total: u64,
28    /// Count of failing units per type (e.g., "service", "slice", "timer")
29    /// Only includes types that have at least one failure
30    #[serde(flatten)]
31    pub by_type: HashMap<String, u64>,
32}
33
/// Unit type suffixes defined by systemd (see `systemd.unit(5)`).
const KNOWN_UNIT_TYPES: [&str; 11] = [
    "service",
    "socket",
    "device",
    "mount",
    "automount",
    "swap",
    "target",
    "path",
    "timer",
    "slice",
    "scope",
];

/// Extract unit type from a unit name (e.g., "foo.service" -> "service").
///
/// Returns `None` when the name is shorter than three bytes or does not start
/// with a character allowed at the start of a unit name (alphanumeric, `-`,
/// `_`, or `\` for escapes). The suffix itself is NOT validated here: a name
/// without any `.` yields the whole name, and unknown suffixes are returned
/// as-is. Callers that need a real unit type must check the result themselves.
fn get_unit_type(unit_name: &str) -> Option<String> {
    // Filter out obviously invalid unit names
    if unit_name.len() < 3 {
        return None;
    }

    let first_char = unit_name.chars().next()?;
    if !(first_char.is_alphanumeric() || matches!(first_char, '-' | '_' | '\\')) {
        return None;
    }

    unit_name.rsplit('.').next().map(str::to_string)
}

/// Return `candidate` if it looks like a real unit name: it passes the
/// [`get_unit_type`] sanity checks, contains a `.`, and ends in a unit type
/// systemd knows about.
fn as_unit_name(candidate: &str) -> Option<&str> {
    let unit_type = get_unit_type(candidate)?;
    (candidate.contains('.') && KNOWN_UNIT_TYPES.contains(&unit_type.as_str()))
        .then_some(candidate)
}

/// Derive the unit a file path belongs to.
///
/// Either the file itself is a unit file (`.../foo.service`), or it lives in a
/// drop-in directory of a unit (`.../foo.service.d/override.conf`), in which
/// case the problem is attributed to `foo.service`.
fn unit_from_path(path: &str) -> Option<&str> {
    let mut components = path.rsplit('/');
    let file_name = components.next()?;
    as_unit_name(file_name).or_else(|| {
        components
            .next()
            .and_then(|dir| dir.strip_suffix(".d"))
            .and_then(as_unit_name)
    })
}

/// Parse `systemd-analyze verify` output to extract the names of failing units.
///
/// Recognised line formats:
/// - "/path/to/foo.service:5: Unknown key..." (unit taken from the path)
/// - "Unit foo.service not found."
/// - "foo.service: Command ... failed..."
///
/// Lines containing "Failed to prepare filename" are ignored because they
/// describe bad input to the command rather than a broken unit. At most one
/// unit is taken per line, and only words ending in a known unit type count,
/// so ordinary dotted words ("e.g.", "1.2", "override.conf") are not mistaken
/// for units. Each unit appears once in the result regardless of how many
/// lines mention it.
fn parse_verify_output(stderr: &str) -> HashSet<String> {
    let mut failing_units = HashSet::new();

    // Work on the trimmed line throughout so indented output is handled too.
    for line in stderr.lines().map(str::trim) {
        if line.is_empty() || line.contains("Failed to prepare filename") {
            continue;
        }

        // Format 1: "/path/file.service:line: message" - the path is
        // everything before the first ':'.
        let unit_in_path = if line.starts_with('/') {
            line.split_once(':')
                .and_then(|(path, _)| unit_from_path(path))
        } else {
            None
        };

        // Format 2: "Unit foo.service ..." or "foo.service: ..." - only
        // consulted when the path did not yield a unit; first match wins.
        let unit = unit_in_path.or_else(|| {
            line.split_whitespace()
                .map(|word| word.trim_end_matches(':').trim_end_matches('.'))
                .filter(|word| !word.contains('(')) // Skip things like "foo(8)"
                .find_map(as_unit_name)
        });

        if let Some(unit) = unit {
            failing_units.insert(unit.to_string());
        }
    }

    failing_units
}
103
104/// Collect verification stats for all units in the system
105pub async fn get_verify_stats(
106    connection: &zbus::Connection,
107    allowlist: &HashSet<String>,
108    blocklist: &HashSet<String>,
109) -> Result<VerifyStats, MonitordVerifyError> {
110    let mut stats = VerifyStats::default();
111
112    // Get list of all units from systemd
113    let manager_proxy = crate::dbus::zbus_systemd::ManagerProxy::builder(connection)
114        .cache_properties(zbus::proxy::CacheProperties::No)
115        .build()
116        .await?;
117    let all_units = manager_proxy.list_units().await?;
118
119    // Filter units based on allowlist/blocklist
120    let units_to_check: Vec<String> = all_units
121        .into_iter()
122        .map(|unit| unit.0)
123        .filter(|unit_name| {
124            // Apply allowlist
125            if !allowlist.is_empty() && !allowlist.contains(unit_name) {
126                return false;
127            }
128            // Apply blocklist
129            if blocklist.contains(unit_name) {
130                return false;
131            }
132            true
133        })
134        .collect();
135
136    if units_to_check.is_empty() {
137        return Ok(stats);
138    }
139
140    // Run systemd-analyze verify on all units at once for better performance
141    let output = tokio::task::spawn_blocking(move || {
142        let mut cmd = Command::new("systemd-analyze");
143        cmd.arg("verify");
144        for unit_name in &units_to_check {
145            cmd.arg(unit_name);
146        }
147        cmd.output()
148    })
149    .await
150    .map_err(|e| MonitordVerifyError::CommandError(e.to_string()))?
151    .map_err(|e| MonitordVerifyError::CommandError(e.to_string()))?;
152
153    // Parse stderr for failing units
154    let stderr = String::from_utf8_lossy(&output.stderr);
155    let failing_units = parse_verify_output(&stderr);
156
157    // Count failures by type
158    for unit_name in failing_units {
159        stats.total += 1;
160
161        if let Some(unit_type) = get_unit_type(&unit_name) {
162            *stats.by_type.entry(unit_type).or_insert(0) += 1;
163        }
164    }
165
166    Ok(stats)
167}
168
169/// Async wrapper that updates verify stats when passed a locked struct
170pub async fn update_verify_stats(
171    connection: zbus::Connection,
172    locked_machine_stats: Arc<RwLock<MachineStats>>,
173    allowlist: HashSet<String>,
174    blocklist: HashSet<String>,
175) -> anyhow::Result<()> {
176    let verify_stats = get_verify_stats(&connection, &allowlist, &blocklist)
177        .await
178        .map_err(|e| anyhow::anyhow!("Error getting verify stats: {:?}", e))?;
179
180    let mut machine_stats = locked_machine_stats.write().await;
181    machine_stats.verify_stats = Some(verify_stats);
182    Ok(())
183}
184
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned set of unit names from string literals.
    fn unit_set(names: &[&str]) -> HashSet<String> {
        names.iter().map(|name| name.to_string()).collect()
    }

    #[test]
    fn test_get_unit_type() {
        assert_eq!(get_unit_type("foo.service"), Some("service".to_string()));
        assert_eq!(get_unit_type("bar.slice"), Some("slice".to_string()));
        assert_eq!(get_unit_type("baz.timer"), Some("timer".to_string()));
        // A name without a '.' yields the whole name
        assert_eq!(get_unit_type("test"), Some("test".to_string()));
    }

    #[test]
    fn test_get_unit_type_rejects_invalid_names() {
        // Too short to be a unit name
        assert_eq!(get_unit_type(""), None);
        assert_eq!(get_unit_type("ab"), None);
        // Paths are not unit names
        assert_eq!(get_unit_type("/etc/foo.service"), None);
    }

    #[test]
    fn test_verify_stats_default() {
        let stats = VerifyStats::default();
        assert_eq!(stats.total, 0);
        assert!(stats.by_type.is_empty());
    }

    #[test]
    fn test_parse_verify_output() {
        let stderr = r#"
/usr/lib/systemd/system/foo.service:4: Unknown section 'Service'. Ignoring.
bar.slice: Command /bin/foo is not executable: No such file or directory
Unit baz.timer not found.
test-with-error.target: Some error message here
"#;
        let failing = parse_verify_output(stderr);

        assert_eq!(
            failing,
            unit_set(&[
                "foo.service",
                "bar.slice",
                "baz.timer",
                "test-with-error.target",
            ])
        );
    }

    #[test]
    fn test_parse_verify_output_skips_noise() {
        // Empty input and input errors produce no failing units
        assert!(parse_verify_output("").is_empty());
        assert!(parse_verify_output(
            "\nFailed to prepare filename foo.service: Invalid argument\n\n"
        )
        .is_empty());
    }

    #[test]
    fn test_parse_verify_output_deduplicates_units() {
        let stderr = "foo.service: first problem\n\
                      /etc/systemd/system/foo.service:3: second problem\n";
        assert_eq!(parse_verify_output(stderr), unit_set(&["foo.service"]));
    }
}