Skip to main content

monitord/
units.rs

1//! # units module
2//!
3//! All main systemd unit statistics. Counts of types of units, unit states and
4//! queued jobs. We also house service specific statistics and system unit states.
5
6use std::collections::HashMap;
7use std::str::FromStr;
8use std::sync::Arc;
9use std::time::Instant;
10use std::time::SystemTime;
11use std::time::UNIX_EPOCH;
12
13use struct_field_names_as_array::FieldNamesAsArray;
14use thiserror::Error;
15use tokio::sync::RwLock;
16use tracing::debug;
17use tracing::error;
18use tracing::warn;
19use zbus::zvariant::ObjectPath;
20use zbus::zvariant::OwnedObjectPath;
21
/// Errors that can be returned while collecting systemd unit statistics.
///
/// Each variant wraps an underlying error and is created automatically by `?`
/// through the `#[from]` conversions.
#[derive(Error, Debug)]
pub enum MonitordUnitsError {
    /// A zbus (D-Bus) operation failed, e.g. building a proxy or reading a property.
    #[error("Units D-Bus error: {0}")]
    ZbusError(#[from] zbus::Error),
    /// An integer did not fit into the target type (e.g. a process count into `u32`).
    #[error("Integer conversion error: {0}")]
    IntConversion(#[from] std::num::TryFromIntError),
    /// Computing a duration from the system clock failed
    /// (e.g. `SystemTime::duration_since(UNIX_EPOCH)`).
    #[error("System time error: {0}")]
    SystemTimeError(#[from] std::time::SystemTimeError),
}
31
32use crate::timer::TimerStats;
33use crate::MachineStats;
34
35// Re-export the enums and function from unit_constants for backwards compatibility
36pub use crate::unit_constants::is_unit_unhealthy;
37pub use crate::unit_constants::is_unit_unhealthy_for_service;
38pub use crate::unit_constants::SystemdUnitActiveState;
39pub use crate::unit_constants::SystemdUnitLoadState;
40pub use crate::unit_constants::SYSTEMD_SERVICE_SUFFIX;
41
/// Inner timing breakdown for the units collector D-Bus phases.
///
/// Helps locate which step of unit collection dominates wall time when the
/// `units` collector is the slowest one in `MonitordStats::collector_timings`.
/// All `*_ms` fields are wall-clock milliseconds; all `*_fetches` fields are
/// per-run counters filled in by `parse_unit_state`.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, PartialEq)]
pub struct UnitsCollectionTimings {
    /// Wall time in milliseconds for the systemd ListUnits D-Bus call
    /// (one batched call returning all units).
    pub list_units_ms: f64,
    /// Wall time in milliseconds for filesystem unit file stats collection
    /// (runs concurrently with list_units; near zero when unit file collection is disabled).
    pub unit_files_ms: f64,
    /// Wall time in milliseconds spent in the per-unit parse loop, including any
    /// per-unit D-Bus calls (timer property fetches, state stats, service stats).
    pub per_unit_loop_ms: f64,
    /// Number of timer units whose stats were successfully collected via D-Bus this run
    /// (failed collections are logged and not counted).
    pub timer_dbus_fetches: u64,
    /// Number of units for which `parse_state` reported a time-in-state D-Bus fetch
    /// this run (only when state_stats_time_in_state is enabled).
    pub state_dbus_fetches: u64,
    /// Number of services whose properties were successfully fetched via D-Bus this run
    /// (failed fetches are logged and not counted).
    pub service_dbus_fetches: u64,
}
62
/// Unit file counts for a scope (root or user), broken down by unit type.
///
/// Keys are the text after the last `.` in each file name (the whole name when
/// it contains no `.`, or `"unknown"` when the name is not valid UTF-8).
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, PartialEq)]
pub struct UnitFilesScope {
    /// Generated unit files by type (e.g. "service" => 2, "mount" => 5)
    pub generated: HashMap<String, u64>,
    /// Transient unit files by type (e.g. "service" => 10, "scope" => 6)
    pub transient: HashMap<String, u64>,
}
71
/// Unit file statistics collected from the filesystem for root and user scopes.
///
/// Produced by `collect_unit_files_stats`.
#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, PartialEq)]
pub struct UnitFilesStats {
    /// Counts from `/run/systemd/generator`, `generator.early`, `generator.late`
    /// (merged into `generated`) and `/run/systemd/transient` (`transient`).
    pub root: UnitFilesScope,
    /// Counts merged across every `/run/user/<entry>/systemd/transient` directory.
    /// `generated` is currently always empty for this scope.
    pub user: UnitFilesScope,
}
78
/// Aggregated systemd unit statistics: counts by type, load state, active state,
/// plus optional per-service and per-timer detailed metrics.
///
/// Field names are exported through `FieldNamesAsArray`, so renaming or
/// reordering fields changes `UNIT_FIELD_NAMES` as well as the serialized form.
#[derive(
    serde::Serialize, serde::Deserialize, Clone, Debug, Default, FieldNamesAsArray, PartialEq,
)]
pub struct SystemdUnitStats {
    /// Number of units in the "activating" state (in the process of being started)
    pub activating_units: u64,
    /// Number of units in the "active" state (currently started and running)
    pub active_units: u64,
    /// Number of automount units (on-demand filesystem mount points)
    pub automount_units: u64,
    /// Number of device units (kernel devices exposed to systemd by udev)
    pub device_units: u64,
    /// Number of units in the "failed" state (exited with error, crashed, or timed out)
    pub failed_units: u64,
    /// Number of units in the "inactive" state (not currently running)
    pub inactive_units: u64,
    /// Number of listed units that have a job queued (non-zero job id)
    pub jobs_queued: u64,
    /// Number of units whose unit file has been successfully loaded into memory
    pub loaded_units: u64,
    /// Number of units whose unit file is masked (symlinked to /dev/null, cannot be started)
    pub masked_units: u64,
    /// Number of mount units (filesystem mount points managed by systemd)
    pub mount_units: u64,
    /// Number of units whose unit file could not be found on disk
    pub not_found_units: u64,
    /// Number of path units (file/directory watch triggers)
    pub path_units: u64,
    /// Number of scope units (externally created process groups, e.g. user sessions)
    pub scope_units: u64,
    /// Number of service units (daemon/process lifecycle management)
    pub service_units: u64,
    /// Number of slice units (resource management groups in the cgroup hierarchy)
    pub slice_units: u64,
    /// Number of socket units (IPC/network socket activation endpoints)
    pub socket_units: u64,
    /// Number of target units (synchronization points for grouping units)
    pub target_units: u64,
    /// Number of timer units (calendar/monotonic scheduled triggers)
    pub timer_units: u64,
    /// Number of timer units with Persistent=yes (triggers missed runs after downtime)
    pub timer_persistent_units: u64,
    /// Number of timer units with RemainAfterElapse=yes (stays loaded after firing)
    pub timer_remain_after_elapse: u64,
    /// Total number of units returned by ListUnits (all types, all states)
    pub total_units: u64,
    /// Unit file statistics from the filesystem (e.g. generator output counts)
    pub unit_files: UnitFilesStats,
    /// Per-service detailed metrics keyed by unit name (e.g. "sshd.service")
    pub service_stats: HashMap<String, ServiceStats>,
    /// Per-timer detailed metrics keyed by unit name (e.g. "logrotate.timer")
    pub timer_stats: HashMap<String, TimerStats>,
    /// Per-unit active/load state tracking keyed by unit name
    pub unit_states: HashMap<String, UnitStates>,
    /// Inner timing breakdown for this collector. Zero-valued before the first
    /// run completes or when the varlink path is taken.
    pub collection_timings: UnitsCollectionTimings,
}
140
/// Per-service metrics from the org.freedesktop.systemd1.Service and Unit D-Bus interfaces.
/// Ref: <https://www.freedesktop.org/software/systemd/man/org.freedesktop.systemd1.html>
///
/// Populated by `parse_service`: the four `*_timestamp` fields are read through
/// the Unit proxy, every other field through the Service proxy.
#[derive(
    serde::Serialize, serde::Deserialize, Clone, Debug, Default, Eq, FieldNamesAsArray, PartialEq,
)]
pub struct ServiceStats {
    /// Realtime timestamp (usec since epoch) when the unit most recently entered the active state
    pub active_enter_timestamp: u64,
    /// Realtime timestamp (usec since epoch) when the unit most recently left the active state
    pub active_exit_timestamp: u64,
    /// Total CPU time consumed by this service's cgroup in nanoseconds
    pub cpuusage_nsec: u64,
    /// Realtime timestamp (usec since epoch) when the unit most recently left the inactive state
    pub inactive_exit_timestamp: u64,
    /// Total bytes read from block I/O by this service's cgroup
    pub ioread_bytes: u64,
    /// Total number of block I/O read operations by this service's cgroup
    pub ioread_operations: u64,
    /// Memory available to the service (MemoryAvailable from cgroup), in bytes
    pub memory_available: u64,
    /// Current memory usage of the service's cgroup in bytes
    pub memory_current: u64,
    /// Number of times systemd has restarted this service (automatic restarts)
    pub nrestarts: u32,
    /// Current number of processes in this service's cgroup
    /// (length of the list returned by the Service proxy's `get_processes()`)
    pub processes: u32,
    /// Configured restart delay for this service in microseconds (RestartUSec)
    pub restart_usec: u64,
    /// Realtime timestamp (usec since epoch) of the most recent state change of any kind
    pub state_change_timestamp: u64,
    /// errno-style exit status code from the main process (0 = success)
    pub status_errno: i32,
    /// Current number of tasks (threads) in this service's cgroup
    pub tasks_current: u64,
    /// Timeout in microseconds for the cleanup of resources after the service exits
    pub timeout_clean_usec: u64,
    /// Watchdog timeout in microseconds; the service must ping within this interval or be killed
    pub watchdog_usec: u64,
}
180
/// Per-unit state tracking combining active state, load state, and computed health.
/// Ref: <https://www.freedesktop.org/software/systemd/man/org.freedesktop.systemd1.html>
///
/// One entry per unit is inserted into `SystemdUnitStats::unit_states` by `parse_state`.
#[derive(
    serde::Serialize, serde::Deserialize, Clone, Debug, Default, Eq, FieldNamesAsArray, PartialEq,
)]
pub struct UnitStates {
    /// Current active state of the unit (active, inactive, failed, activating, deactivating, reloading).
    /// Strings that fail to parse are stored as `unknown`.
    pub active_state: SystemdUnitActiveState,
    /// Current load state of the unit (loaded, error, masked, not_found).
    /// Strings that fail to parse are stored as `unknown`.
    pub load_state: SystemdUnitLoadState,
    /// Computed health flag: true when a loaded unit is not active, or when load state is error/not_found.
    /// Masked units are never marked unhealthy since masking is an intentional admin action.
    /// Optional config can ignore inactive oneshot services.
    pub unhealthy: bool,
    /// Microseconds elapsed since the unit's most recent state change.
    /// None when time-in-state tracking is disabled in config (expensive D-Bus lookup per unit)
    /// or when no D-Bus connection was supplied to `parse_state`.
    pub time_in_state_usecs: Option<u64>,
}
199
200// Declare state types
201// Reference: https://www.freedesktop.org/software/systemd/man/org.freedesktop.systemd1.html
202// SubState can be unit-type-specific so can't enum
203
/// One unit entry from the systemd manager's unit listing.
///
/// Built from the 10-element tuple via the `From` impl on this type; the
/// field order mirrors the tuple order.
#[derive(Debug)]
pub struct ListedUnit {
    /// The primary unit name
    pub name: String,
    /// The human readable description
    pub description: String,
    /// The load state (i.e. whether the unit file has been loaded successfully)
    pub load_state: String,
    /// The active state (i.e. whether the unit is currently started or not)
    pub active_state: String,
    /// The sub state (i.e. unit type more specific state)
    pub sub_state: String,
    /// A unit that is being followed in its state by this unit, if there is any,
    /// otherwise the empty string
    pub follow_unit: String,
    /// The unit object path
    pub unit_object_path: OwnedObjectPath,
    /// If there is a job queued for the unit, the numeric job id, 0 otherwise
    pub job_id: u32,
    /// The job type as string
    pub job_type: String,
    /// The job object path
    pub job_object_path: OwnedObjectPath,
}
217impl
218    From<(
219        String,
220        String,
221        String,
222        String,
223        String,
224        String,
225        OwnedObjectPath,
226        u32,
227        String,
228        OwnedObjectPath,
229    )> for ListedUnit
230{
231    fn from(
232        tuple: (
233            String,
234            String,
235            String,
236            String,
237            String,
238            String,
239            OwnedObjectPath,
240            u32,
241            String,
242            OwnedObjectPath,
243        ),
244    ) -> Self {
245        ListedUnit {
246            name: tuple.0,
247            description: tuple.1,
248            load_state: tuple.2,
249            active_state: tuple.3,
250            sub_state: tuple.4,
251            follow_unit: tuple.5,
252            unit_object_path: tuple.6,
253            job_id: tuple.7,
254            job_type: tuple.8,
255            job_object_path: tuple.9,
256        }
257    }
258}
259
/// Field names of [`ServiceStats`], as generated by its `FieldNamesAsArray` derive.
pub const SERVICE_FIELD_NAMES: &[&str] = &ServiceStats::FIELD_NAMES_AS_ARRAY;
/// Field names of [`SystemdUnitStats`], as generated by its `FieldNamesAsArray` derive.
pub const UNIT_FIELD_NAMES: &[&str] = &SystemdUnitStats::FIELD_NAMES_AS_ARRAY;
/// Field names of [`UnitStates`], as generated by its `FieldNamesAsArray` derive.
pub const UNIT_STATES_FIELD_NAMES: &[&str] = &UnitStates::FIELD_NAMES_AS_ARRAY;
263
264/// Pull out selected systemd service statistics
265async fn parse_service(
266    connection: &zbus::Connection,
267    name: &str,
268    object_path: &OwnedObjectPath,
269) -> Result<ServiceStats, MonitordUnitsError> {
270    debug!("Parsing service {} stats", name);
271
272    let sp = crate::dbus::zbus_service::ServiceProxy::builder(connection)
273        .cache_properties(zbus::proxy::CacheProperties::No)
274        .path(object_path.clone())?
275        .build()
276        .await?;
277    let up = crate::dbus::zbus_unit::UnitProxy::builder(connection)
278        .cache_properties(zbus::proxy::CacheProperties::No)
279        .path(object_path.clone())?
280        .build()
281        .await?;
282
283    // Use tokio::join! without tokio::spawn to avoid per-task allocation overhead.
284    // These all share the same D-Bus connection so spawn adds no parallelism benefit.
285    let (
286        active_enter_timestamp,
287        active_exit_timestamp,
288        cpuusage_nsec,
289        inactive_exit_timestamp,
290        ioread_bytes,
291        ioread_operations,
292        memory_current,
293        memory_available,
294        nrestarts,
295        processes,
296        restart_usec,
297        state_change_timestamp,
298        status_errno,
299        tasks_current,
300        timeout_clean_usec,
301        watchdog_usec,
302    ) = tokio::join!(
303        up.active_enter_timestamp(),
304        up.active_exit_timestamp(),
305        sp.cpuusage_nsec(),
306        up.inactive_exit_timestamp(),
307        sp.ioread_bytes(),
308        sp.ioread_operations(),
309        sp.memory_current(),
310        sp.memory_available(),
311        sp.nrestarts(),
312        sp.get_processes(),
313        sp.restart_usec(),
314        up.state_change_timestamp(),
315        sp.status_errno(),
316        sp.tasks_current(),
317        sp.timeout_clean_usec(),
318        sp.watchdog_usec(),
319    );
320
321    Ok(ServiceStats {
322        active_enter_timestamp: active_enter_timestamp?,
323        active_exit_timestamp: active_exit_timestamp?,
324        cpuusage_nsec: cpuusage_nsec?,
325        inactive_exit_timestamp: inactive_exit_timestamp?,
326        ioread_bytes: ioread_bytes?,
327        ioread_operations: ioread_operations?,
328        memory_current: memory_current?,
329        memory_available: memory_available?,
330        nrestarts: nrestarts?,
331        processes: processes?.len().try_into()?,
332        restart_usec: restart_usec?,
333        state_change_timestamp: state_change_timestamp?,
334        status_errno: status_errno?,
335        tasks_current: tasks_current?,
336        timeout_clean_usec: timeout_clean_usec?,
337        watchdog_usec: watchdog_usec?,
338    })
339}
340
341async fn get_time_in_state(
342    connection: Option<&zbus::Connection>,
343    unit: &ListedUnit,
344) -> Result<Option<u64>, MonitordUnitsError> {
345    match connection {
346        Some(c) => {
347            let up = crate::dbus::zbus_unit::UnitProxy::builder(c)
348                .cache_properties(zbus::proxy::CacheProperties::No)
349                .path(ObjectPath::from(unit.unit_object_path.clone()))?
350                .build()
351                .await?;
352            let now: u64 = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() * 1_000_000;
353            let state_change_timestamp = match up.state_change_timestamp().await {
354                Ok(sct) => sct,
355                Err(err) => {
356                    error!(
357                        "Unable to get state_change_timestamp for {} - Setting to 0: {:?}",
358                        &unit.name, err,
359                    );
360                    0
361                }
362            };
363            Ok(Some(now - state_change_timestamp))
364        }
365        None => {
366            error!("No zbus connection passed, but time_in_state_usecs enabled");
367            Ok(None)
368        }
369    }
370}
371
372/// Parse state of a unit into our unit_states hash.
373///
374/// Returns true when an actual time-in-state D-Bus fetch was performed,
375/// so callers can keep `state_dbus_fetches` honest. Allowlist/blocklist
376/// short-circuits and `state_stats_time_in_state = false` both return false.
377pub async fn parse_state(
378    stats: &mut SystemdUnitStats,
379    unit: &ListedUnit,
380    config: &crate::config::UnitsConfig,
381    connection: Option<&zbus::Connection>,
382) -> Result<bool, MonitordUnitsError> {
383    if config.state_stats_blocklist.contains(&unit.name) {
384        debug!("Skipping state stats for {} due to blocklist", &unit.name);
385        return Ok(false);
386    }
387    if !config.state_stats_allowlist.is_empty()
388        && !config.state_stats_allowlist.contains(&unit.name)
389    {
390        return Ok(false);
391    }
392    let active_state = SystemdUnitActiveState::from_str(&unit.active_state)
393        .unwrap_or(SystemdUnitActiveState::unknown);
394    let load_state = SystemdUnitLoadState::from_str(&unit.load_state.replace('-', "_"))
395        .unwrap_or(SystemdUnitLoadState::unknown);
396    let mut is_oneshot_service = false;
397    if config.ignore_inactive_oneshot_services
398        && unit.name.ends_with(SYSTEMD_SERVICE_SUFFIX)
399        && matches!(active_state, SystemdUnitActiveState::inactive)
400        && matches!(load_state, SystemdUnitLoadState::loaded)
401    {
402        if let Some(conn) = connection {
403            match is_oneshot_service_unit(conn, unit).await {
404                Ok(is_oneshot) => is_oneshot_service = is_oneshot,
405                Err(err) => warn!(
406                    "Unable to get Service.Type for {} (assuming not oneshot): {:?}",
407                    &unit.name, err
408                ),
409            }
410        }
411    }
412
413    // Get the state_change_timestamp to determine time in usecs we've been in current state
414    let mut time_in_state_usecs: Option<u64> = None;
415    let mut did_dbus_fetch = false;
416    if config.state_stats_time_in_state {
417        time_in_state_usecs = get_time_in_state(connection, unit).await?;
418        // get_time_in_state only issues a D-Bus call when connection is Some;
419        // the None path logs an error and returns Ok(None) without calling out.
420        did_dbus_fetch = connection.is_some();
421    }
422
423    stats.unit_states.insert(
424        unit.name.clone(),
425        UnitStates {
426            active_state,
427            load_state,
428            unhealthy: is_unit_unhealthy_for_service(
429                active_state,
430                load_state,
431                is_oneshot_service,
432                config.ignore_inactive_oneshot_services,
433            ),
434            time_in_state_usecs,
435        },
436    );
437    Ok(did_dbus_fetch)
438}
439
440async fn is_oneshot_service_unit(
441    connection: &zbus::Connection,
442    unit: &ListedUnit,
443) -> Result<bool, MonitordUnitsError> {
444    let sp = crate::dbus::zbus_service::ServiceProxy::builder(connection)
445        .cache_properties(zbus::proxy::CacheProperties::No)
446        .path(ObjectPath::from(unit.unit_object_path.clone()))?
447        .build()
448        .await?;
449    Ok(sp.type_().await? == "oneshot")
450}
451
452/// Parse a unit and add to overall counts of state, type etc.
453fn parse_unit(stats: &mut SystemdUnitStats, unit: &ListedUnit) {
454    // Count unit type
455    match unit.name.rsplit('.').next() {
456        Some("automount") => stats.automount_units += 1,
457        Some("device") => stats.device_units += 1,
458        Some("mount") => stats.mount_units += 1,
459        Some("path") => stats.path_units += 1,
460        Some("scope") => stats.scope_units += 1,
461        Some("service") => stats.service_units += 1,
462        Some("slice") => stats.slice_units += 1,
463        Some("socket") => stats.socket_units += 1,
464        Some("target") => stats.target_units += 1,
465        Some("timer") => stats.timer_units += 1,
466        unknown => debug!("Found unhandled '{:?}' unit type", unknown),
467    };
468    // Count load state
469    match unit.load_state.as_str() {
470        "loaded" => stats.loaded_units += 1,
471        "masked" => stats.masked_units += 1,
472        "not-found" => stats.not_found_units += 1,
473        _ => debug!("{} is not loaded. It's {}", unit.name, unit.load_state),
474    };
475    // Count unit status
476    match unit.active_state.as_str() {
477        "activating" => stats.activating_units += 1,
478        "active" => stats.active_units += 1,
479        "failed" => stats.failed_units += 1,
480        "inactive" => stats.inactive_units += 1,
481        unknown => debug!("Found unhandled '{}' unit state", unknown),
482    };
483    // Count jobs queued
484    if unit.job_id != 0 {
485        stats.jobs_queued += 1;
486    }
487}
488
/// Root-scope transient unit directory; `collect_unit_files_stats` prepends
/// its `fs_root` argument to this path before reading it.
const TRANSIENT_DIR: &str = "/run/systemd/transient";
490
491async fn count_unit_files_by_type(path: &str) -> HashMap<String, u64> {
492    let mut dir = match tokio::fs::read_dir(path).await {
493        Ok(d) => d,
494        Err(err) => {
495            debug!("Unable to read {}: {:?}", path, err);
496            return HashMap::new();
497        }
498    };
499    let mut counts = HashMap::new();
500    loop {
501        match dir.next_entry().await {
502            Ok(Some(entry)) => {
503                let file_type = match entry.file_type().await {
504                    Ok(ft) => ft,
505                    Err(_) => continue,
506                };
507                if !file_type.is_file() {
508                    continue;
509                }
510                let name = entry.file_name();
511                let unit_type = name
512                    .to_str()
513                    .and_then(|n| n.rsplit('.').next())
514                    .unwrap_or("unknown");
515                *counts.entry(unit_type.to_string()).or_insert(0) += 1;
516            }
517            Ok(None) => break,
518            Err(err) => {
519                warn!("Error reading entry in {}: {:?}", path, err);
520                continue;
521            }
522        }
523    }
524    counts
525}
526
/// Add every count in `source` onto the matching key of `target`,
/// creating keys that `target` does not have yet.
fn merge_counts(target: &mut HashMap<String, u64>, source: HashMap<String, u64>) {
    source.into_iter().for_each(|(unit_type, count)| {
        *target.entry(unit_type).or_default() += count;
    });
}
532
533/// Enumerate the per-user systemd transient directories under `{fs_root}/run/user`.
534async fn enumerate_user_transient_dirs(fs_root: &str) -> Vec<String> {
535    let user_dir = format!("{fs_root}/run/user");
536    match tokio::fs::read_dir(&user_dir).await {
537        Ok(mut entries) => {
538            let mut dirs = Vec::new();
539            loop {
540                match entries.next_entry().await {
541                    Ok(Some(entry)) => {
542                        dirs.push(format!("{}/systemd/transient", entry.path().display()));
543                    }
544                    Ok(None) => break,
545                    Err(err) => {
546                        warn!("Error reading entry in {}: {:?}", user_dir, err);
547                        continue;
548                    }
549                }
550            }
551            dirs
552        }
553        Err(err) => {
554            debug!("Unable to read {}: {:?}", user_dir, err);
555            Vec::new()
556        }
557    }
558}
559
560/// Collect unit file statistics from the filesystem.
561/// `fs_root` is prepended to all paths — empty string for the host,
562/// `/proc/<pid>/root` for containers.
563///
564/// All directory reads are issued in parallel: the three generator directories,
565/// the root transient directory, and user-dir enumeration run concurrently in a
566/// first batch; per-user transient reads run concurrently in a second batch.
567pub async fn collect_unit_files_stats(fs_root: &str) -> UnitFilesStats {
568    // Pre-bind formatted paths to extend their lifetime across the join.
569    let gen_path = format!("{fs_root}/run/systemd/generator");
570    let gen_early_path = format!("{fs_root}/run/systemd/generator.early");
571    let gen_late_path = format!("{fs_root}/run/systemd/generator.late");
572    let transient_path = format!("{fs_root}{TRANSIENT_DIR}");
573
574    // First batch: fixed paths + user dir enumeration all in parallel.
575    let (gen, gen_early, gen_late, root_transient, user_dirs) = tokio::join!(
576        count_unit_files_by_type(&gen_path),
577        count_unit_files_by_type(&gen_early_path),
578        count_unit_files_by_type(&gen_late_path),
579        count_unit_files_by_type(&transient_path),
580        enumerate_user_transient_dirs(fs_root),
581    );
582
583    let mut root_generated = HashMap::new();
584    merge_counts(&mut root_generated, gen);
585    merge_counts(&mut root_generated, gen_early);
586    merge_counts(&mut root_generated, gen_late);
587
588    // Second batch: read every user transient directory in parallel.
589    let user_transient_counts =
590        futures_util::future::join_all(user_dirs.iter().map(|d| count_unit_files_by_type(d))).await;
591
592    let mut user_transient = HashMap::new();
593    for counts in user_transient_counts {
594        merge_counts(&mut user_transient, counts);
595    }
596
597    UnitFilesStats {
598        root: UnitFilesScope {
599            generated: root_generated,
600            transient: root_transient,
601        },
602        user: UnitFilesScope {
603            generated: HashMap::new(),
604            transient: user_transient,
605        },
606    }
607}
608
609/// Pull all units from dbus and count how system is setup and behaving
610pub async fn parse_unit_state(
611    config: &crate::config::Config,
612    connection: &zbus::Connection,
613    fs_root: &str,
614) -> Result<SystemdUnitStats, MonitordUnitsError> {
615    if !config.units.state_stats_allowlist.is_empty() {
616        debug!(
617            "Using unit state allowlist: {:?}",
618            config.units.state_stats_allowlist
619        );
620    }
621
622    if !config.units.state_stats_blocklist.is_empty() {
623        debug!(
624            "Using unit state blocklist: {:?}",
625            config.units.state_stats_blocklist,
626        );
627    }
628
629    let mut stats = SystemdUnitStats::default();
630
631    let p = crate::dbus::zbus_systemd::ManagerProxy::builder(connection)
632        .cache_properties(zbus::proxy::CacheProperties::No)
633        .build()
634        .await?;
635
636    // Run filesystem collection and D-Bus list_units in parallel, timing each independently.
637    let (unit_files_result, units_result) = tokio::join!(
638        async {
639            let start = Instant::now();
640            let files = if config.units.unit_files {
641                collect_unit_files_stats(fs_root).await
642            } else {
643                UnitFilesStats::default()
644            };
645            (files, start.elapsed().as_secs_f64() * 1000.0)
646        },
647        async {
648            let start = Instant::now();
649            let units = p.list_units().await;
650            (units, start.elapsed().as_secs_f64() * 1000.0)
651        },
652    );
653    let (unit_files, unit_files_ms) = unit_files_result;
654    let (units_result, list_units_ms) = units_result;
655    stats.collection_timings.unit_files_ms = unit_files_ms;
656    stats.collection_timings.list_units_ms = list_units_ms;
657    stats.unit_files = unit_files;
658
659    let units = units_result?;
660    stats.total_units = units.len() as u64;
661
662    let per_unit_loop_start = Instant::now();
663    let mut state_dbus_fetches: u64 = 0;
664    let mut service_dbus_fetches: u64 = 0;
665    let mut timer_dbus_fetches: u64 = 0;
666
667    for unit_raw in units {
668        let unit: ListedUnit = unit_raw.into();
669        // Collect unit types + states counts
670        parse_unit(&mut stats, &unit);
671
672        // Collect per unit state stats - ActiveState + LoadState
673        // Not collecting SubState (yet)
674        if config.units.state_stats {
675            let did_dbus_fetch =
676                parse_state(&mut stats, &unit, &config.units, Some(connection)).await?;
677            if did_dbus_fetch {
678                state_dbus_fetches += 1;
679            }
680        }
681
682        // Collect service stats
683        if config.services.contains(&unit.name) {
684            debug!("Collecting service stats for {:?}", &unit);
685            match parse_service(connection, &unit.name, &unit.unit_object_path).await {
686                Ok(service_stats) => {
687                    stats.service_stats.insert(unit.name.clone(), service_stats);
688                    service_dbus_fetches += 1;
689                }
690                Err(err) => error!(
691                    "Unable to get service stats for {} {}: {:#?}",
692                    &unit.name, &unit.unit_object_path, err
693                ),
694            }
695        }
696
697        // Collect timer stats
698        if config.timers.enabled && unit.name.contains(".timer") {
699            if config.timers.blocklist.contains(&unit.name) {
700                debug!("Skipping timer stats for {} due to blocklist", &unit.name);
701                continue;
702            }
703            if !config.timers.allowlist.is_empty() && !config.timers.allowlist.contains(&unit.name)
704            {
705                continue;
706            }
707            let timer_stats: Option<TimerStats> =
708                match crate::timer::collect_timer_stats(connection, &mut stats, &unit).await {
709                    Ok(ts) => {
710                        timer_dbus_fetches += 1;
711                        Some(ts)
712                    }
713                    Err(err) => {
714                        error!("Failed to get {} stats: {:#?}", &unit.name, err);
715                        None
716                    }
717                };
718            if let Some(ts) = timer_stats {
719                stats.timer_stats.insert(unit.name.clone(), ts);
720            }
721        }
722    }
723    let per_unit_loop_elapsed = per_unit_loop_start.elapsed();
724    stats.collection_timings.per_unit_loop_ms = per_unit_loop_elapsed.as_secs_f64() * 1000.0;
725    stats.collection_timings.state_dbus_fetches = state_dbus_fetches;
726    stats.collection_timings.service_dbus_fetches = service_dbus_fetches;
727    stats.collection_timings.timer_dbus_fetches = timer_dbus_fetches;
728
729    debug!("unit stats: {:?}", stats);
730    Ok(stats)
731}
732
733/// Async wrapper that can update unit stats when passed a locked struct.
734/// `fs_root` is prepended to filesystem paths for unit file stats —
735/// empty string for the host, `/proc/<pid>/root` for containers.
736pub async fn update_unit_stats(
737    config: Arc<crate::config::Config>,
738    connection: zbus::Connection,
739    locked_machine_stats: Arc<RwLock<MachineStats>>,
740    fs_root: String,
741) -> anyhow::Result<()> {
742    let mut machine_stats = locked_machine_stats.write().await;
743    match parse_unit_state(&config, &connection, &fs_root).await {
744        Ok(units_stats) => machine_stats.units = units_stats,
745        Err(err) => error!("units stats failed: {:?}", err),
746    }
747    Ok(())
748}
749
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use strum::IntoEnumIterator;

    /// Build the `ListedUnit` fixture shared by the tests below: the
    /// `apport-autoreport.timer` unit reported as `loaded` / `inactive` /
    /// `dead`, with `job_id` 0 and an empty `job_type`.
    fn get_unit_file() -> ListedUnit {
        ListedUnit {
            name: String::from("apport-autoreport.timer"),
            description: String::from(
                "Process error reports when automatic reporting is enabled (timer based)",
            ),
            load_state: String::from("loaded"),
            active_state: String::from("inactive"),
            sub_state: String::from("dead"),
            follow_unit: String::new(),
            unit_object_path: ObjectPath::try_from(
                "/org/freedesktop/systemd1/unit/apport_2dautoreport_2etimer",
            )
            .expect("Unable to make an object path")
            .into(),
            job_id: 0,
            job_type: String::new(),
            job_object_path: ObjectPath::try_from("/")
                .expect("Unable to make an object path")
                .into(),
        }
    }

    /// Exercise `parse_state` with no lists, with an allowlist only, and with
    /// an allowlist plus a blocklist. No D-Bus connection is supplied, so the
    /// returned "did fetch" flag is expected to be `false` in every case.
    #[tokio::test]
    async fn test_state_parse() -> Result<(), MonitordUnitsError> {
        let test_unit_name = String::from("apport-autoreport.timer");
        // Every field is spelled out so that adding a field to
        // `SystemdUnitStats` forces this expectation to be revisited.
        let expected_stats = SystemdUnitStats {
            activating_units: 0,
            active_units: 0,
            automount_units: 0,
            device_units: 0,
            failed_units: 0,
            inactive_units: 0,
            jobs_queued: 0,
            loaded_units: 0,
            masked_units: 0,
            mount_units: 0,
            not_found_units: 0,
            path_units: 0,
            scope_units: 0,
            service_units: 0,
            slice_units: 0,
            socket_units: 0,
            target_units: 0,
            timer_units: 0,
            timer_persistent_units: 0,
            timer_remain_after_elapse: 0,
            total_units: 0,
            unit_files: UnitFilesStats::default(),
            service_stats: HashMap::new(),
            timer_stats: HashMap::new(),
            unit_states: HashMap::from([(
                test_unit_name.clone(),
                UnitStates {
                    active_state: SystemdUnitActiveState::inactive,
                    load_state: SystemdUnitLoadState::loaded,
                    unhealthy: true,
                    time_in_state_usecs: None,
                },
            )]),
            collection_timings: UnitsCollectionTimings::default(),
        };
        let mut stats = SystemdUnitStats::default();
        let systemd_unit = get_unit_file();
        let mut config = crate::config::UnitsConfig::default();

        // Test no allow list or blocklist; with connection: None, parse_state
        // takes the no-op path inside get_time_in_state and returns false.
        let did_fetch = parse_state(&mut stats, &systemd_unit, &config, None).await?;
        assert_eq!(expected_stats, stats);
        assert!(!did_fetch);

        // Create an allow list
        config.state_stats_allowlist = HashSet::from([test_unit_name.clone()]);

        // test no blocklist and only allow list - Should equal the same as no lists above
        let mut allowlist_stats = SystemdUnitStats::default();
        let did_fetch = parse_state(&mut allowlist_stats, &systemd_unit, &config, None).await?;
        assert_eq!(expected_stats, allowlist_stats);
        assert!(!did_fetch);

        // Now add a blocklist
        config.state_stats_blocklist = HashSet::from([test_unit_name]);

        // test blocklist with allow list (show it's preferred)
        let mut blocklist_stats = SystemdUnitStats::default();
        let expected_blocklist_stats = SystemdUnitStats::default();
        let did_fetch = parse_state(&mut blocklist_stats, &systemd_unit, &config, None).await?;
        assert_eq!(expected_blocklist_stats, blocklist_stats);
        // Blocklist short-circuit must NOT count as a D-Bus fetch.
        assert!(!did_fetch);
        Ok(())
    }

    /// `parse_unit` on the fixture must bump exactly the inactive, loaded and
    /// timer counters and leave everything else (including `total_units`) at
    /// its default.
    #[test]
    fn test_unit_parse() {
        let expected_stats = SystemdUnitStats {
            activating_units: 0,
            active_units: 0,
            automount_units: 0,
            device_units: 0,
            failed_units: 0,
            inactive_units: 1,
            jobs_queued: 0,
            loaded_units: 1,
            masked_units: 0,
            mount_units: 0,
            not_found_units: 0,
            path_units: 0,
            scope_units: 0,
            service_units: 0,
            slice_units: 0,
            socket_units: 0,
            target_units: 0,
            timer_units: 1,
            timer_persistent_units: 0,
            timer_remain_after_elapse: 0,
            total_units: 0,
            unit_files: UnitFilesStats::default(),
            service_stats: HashMap::new(),
            timer_stats: HashMap::new(),
            unit_states: HashMap::new(),
            collection_timings: UnitsCollectionTimings::default(),
        };
        let mut stats = SystemdUnitStats::default();
        let systemd_unit = get_unit_file();
        parse_unit(&mut stats, &systemd_unit);
        assert_eq!(expected_stats, stats);
    }

    /// A unit whose active state is `activating` must be counted as
    /// activating only, not as active or inactive.
    #[test]
    fn test_unit_parse_activating() {
        let mut activating_unit = get_unit_file();
        activating_unit.active_state = String::from("activating");
        let mut stats = SystemdUnitStats::default();
        parse_unit(&mut stats, &activating_unit);
        assert_eq!(stats.activating_units, 1);
        assert_eq!(stats.active_units, 0);
        assert_eq!(stats.inactive_units, 0);
    }

    /// Both state enums must yield at least one variant when iterated.
    #[test]
    fn test_iterators() {
        // Peek at the first item instead of collecting into a Vec just to
        // compare its length against zero.
        assert!(SystemdUnitActiveState::iter().next().is_some());
        assert!(SystemdUnitLoadState::iter().next().is_some());
    }

    /// Files in a directory are counted per suffix; the
    /// `multi-user.target.wants` directory must not add a `wants` entry.
    #[tokio::test]
    async fn test_count_unit_files_by_type() {
        let dir = tempfile::tempdir().expect("Unable to create temp dir");
        let path = dir.path();

        std::fs::write(path.join("sshd.service"), "").unwrap();
        std::fs::write(path.join("nginx.service"), "").unwrap();
        std::fs::write(path.join("boot.mount"), "").unwrap();
        std::fs::write(path.join("swap.swap"), "").unwrap();
        std::fs::create_dir(path.join("multi-user.target.wants")).unwrap();

        let counts = count_unit_files_by_type(path.to_str().unwrap()).await;
        assert_eq!(counts.get("service"), Some(&2));
        assert_eq!(counts.get("mount"), Some(&1));
        assert_eq!(counts.get("swap"), Some(&1));
        assert_eq!(counts.get("wants"), None);
        assert_eq!(counts.len(), 3);
    }

    /// A directory that does not exist yields an empty count map.
    #[tokio::test]
    async fn test_count_unit_files_by_type_nonexistent_dir() {
        let counts = count_unit_files_by_type("/nonexistent/path").await;
        assert!(counts.is_empty());
    }

    /// With a temp dir passed as `fs_root`, unit files placed under its
    /// `run/systemd/{generator,transient}` and
    /// `run/user/1000/systemd/transient` directories must show up in the
    /// matching root/user buckets.
    #[tokio::test]
    async fn test_collect_unit_files_stats_with_fs_root() {
        let root = tempfile::tempdir().expect("Unable to create temp dir");
        let root_path = root.path();

        let gen_dir = root_path.join("run/systemd/generator");
        std::fs::create_dir_all(&gen_dir).unwrap();
        std::fs::write(gen_dir.join("boot.mount"), "").unwrap();
        std::fs::write(gen_dir.join("swap.swap"), "").unwrap();

        let transient_dir = root_path.join("run/systemd/transient");
        std::fs::create_dir_all(&transient_dir).unwrap();
        std::fs::write(transient_dir.join("run-thing.service"), "").unwrap();

        let user_transient = root_path.join("run/user/1000/systemd/transient");
        std::fs::create_dir_all(&user_transient).unwrap();
        std::fs::write(user_transient.join("app-code.scope"), "").unwrap();
        std::fs::write(user_transient.join("app-term.scope"), "").unwrap();

        let stats = collect_unit_files_stats(root_path.to_str().unwrap()).await;
        assert_eq!(stats.root.generated.get("mount"), Some(&1));
        assert_eq!(stats.root.generated.get("swap"), Some(&1));
        assert_eq!(stats.root.transient.get("service"), Some(&1));
        assert_eq!(stats.user.transient.get("scope"), Some(&2));
        assert!(stats.user.generated.is_empty());
    }
}