background_hang_monitor/
background_hang_monitor.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
4
5use std::collections::VecDeque;
6use std::thread::{self, Builder, JoinHandle};
7use std::time::{Duration, Instant};
8
9use background_hang_monitor_api::{
10    BackgroundHangMonitor, BackgroundHangMonitorClone, BackgroundHangMonitorControlMsg,
11    BackgroundHangMonitorExitSignal, BackgroundHangMonitorRegister, HangAlert, HangAnnotation,
12    HangMonitorAlert, MonitoredComponentId,
13};
14use base::generic_channel::{GenericReceiver, GenericSender, RoutedReceiver};
15use crossbeam_channel::{Receiver, Sender, after, never, select, unbounded};
16use log::{error, warn};
17use rustc_hash::FxHashMap;
18
19use crate::SamplerImpl;
20use crate::sampler::{NativeStack, Sampler};
21
22#[derive(Clone)]
23pub struct HangMonitorRegister {
24    sender: MonitoredComponentSender,
25    monitoring_enabled: bool,
26}
27
28impl HangMonitorRegister {
29    /// Start a new hang monitor worker, and return a handle to register components for monitoring,
30    /// as well as a join handle on the worker thread.
31    pub fn init(
32        constellation_chan: GenericSender<HangMonitorAlert>,
33        control_port: GenericReceiver<BackgroundHangMonitorControlMsg>,
34        monitoring_enabled: bool,
35    ) -> (Box<dyn BackgroundHangMonitorRegister>, JoinHandle<()>) {
36        let (sender, port) = unbounded();
37
38        let join_handle = Builder::new()
39            .name("BackgroundHangMonitor".to_owned())
40            .spawn(move || {
41                let mut monitor = BackgroundHangMonitorWorker::new(
42                    constellation_chan,
43                    control_port,
44                    port,
45                    monitoring_enabled,
46                );
47                while monitor.run() {
48                    // Monitoring until all senders have been dropped...
49                }
50            })
51            .expect("Couldn't start BHM worker.");
52        (
53            Box::new(HangMonitorRegister {
54                sender,
55                monitoring_enabled,
56            }),
57            join_handle,
58        )
59    }
60}
61
62impl BackgroundHangMonitorRegister for HangMonitorRegister {
63    /// Register a component for monitoring.
64    /// Returns a dedicated wrapper around a sender
65    /// to be used for communication with the hang monitor worker.
66    fn register_component(
67        &self,
68        component_id: MonitoredComponentId,
69        transient_hang_timeout: Duration,
70        permanent_hang_timeout: Duration,
71        exit_signal: Box<dyn BackgroundHangMonitorExitSignal>,
72    ) -> Box<dyn BackgroundHangMonitor> {
73        let bhm_chan = BackgroundHangMonitorChan::new(
74            self.sender.clone(),
75            component_id,
76            self.monitoring_enabled,
77        );
78
79        bhm_chan.send(MonitoredComponentMsg::Register(
80            SamplerImpl::default(),
81            thread::current().name().map(str::to_owned),
82            transient_hang_timeout,
83            permanent_hang_timeout,
84            exit_signal,
85        ));
86        Box::new(bhm_chan)
87    }
88}
89
90impl BackgroundHangMonitorClone for HangMonitorRegister {
91    fn clone_box(&self) -> Box<dyn BackgroundHangMonitorRegister> {
92        Box::new(self.clone())
93    }
94}
95
96/// Messages sent from monitored components to the monitor.
97enum MonitoredComponentMsg {
98    /// Register component for monitoring,
99    Register(
100        SamplerImpl,
101        Option<String>,
102        Duration,
103        Duration,
104        Box<dyn BackgroundHangMonitorExitSignal>,
105    ),
106    /// Unregister component for monitoring.
107    Unregister,
108    /// Notify start of new activity for a given component,
109    NotifyActivity(HangAnnotation),
110    /// Notify start of waiting for a new task to come-in for processing.
111    NotifyWait,
112}
113
114/// A wrapper around a sender to the monitor,
115/// which will send the Id of the monitored component along with each message,
116/// and keep track of whether the monitor is still listening on the other end.
117struct BackgroundHangMonitorChan {
118    sender: MonitoredComponentSender,
119    component_id: MonitoredComponentId,
120    monitoring_enabled: bool,
121}
122
123impl BackgroundHangMonitorChan {
124    fn new(
125        sender: MonitoredComponentSender,
126        component_id: MonitoredComponentId,
127        monitoring_enabled: bool,
128    ) -> Self {
129        BackgroundHangMonitorChan {
130            sender,
131            component_id,
132            monitoring_enabled,
133        }
134    }
135
136    fn send(&self, msg: MonitoredComponentMsg) {
137        self.sender
138            .send((self.component_id.clone(), msg))
139            .expect("BHM is gone");
140    }
141}
142
143impl BackgroundHangMonitor for BackgroundHangMonitorChan {
144    fn notify_activity(&self, annotation: HangAnnotation) {
145        if self.monitoring_enabled {
146            let msg = MonitoredComponentMsg::NotifyActivity(annotation);
147            self.send(msg);
148        }
149    }
150    fn notify_wait(&self) {
151        if self.monitoring_enabled {
152            let msg = MonitoredComponentMsg::NotifyWait;
153            self.send(msg);
154        }
155    }
156    fn unregister(&self) {
157        let msg = MonitoredComponentMsg::Unregister;
158        self.send(msg);
159    }
160}
161
162struct MonitoredComponent {
163    sampler: SamplerImpl,
164    last_activity: Instant,
165    last_annotation: Option<HangAnnotation>,
166    transient_hang_timeout: Duration,
167    permanent_hang_timeout: Duration,
168    sent_transient_alert: bool,
169    sent_permanent_alert: bool,
170    is_waiting: bool,
171    exit_signal: Box<dyn BackgroundHangMonitorExitSignal>,
172}
173
174struct Sample(MonitoredComponentId, Instant, NativeStack);
175
176struct BackgroundHangMonitorWorker {
177    component_names: FxHashMap<MonitoredComponentId, String>,
178    monitored_components: FxHashMap<MonitoredComponentId, MonitoredComponent>,
179    constellation_chan: GenericSender<HangMonitorAlert>,
180    port: Receiver<(MonitoredComponentId, MonitoredComponentMsg)>,
181    control_port: Option<RoutedReceiver<BackgroundHangMonitorControlMsg>>,
182    sampling_duration: Option<Duration>,
183    sampling_max_duration: Option<Duration>,
184    last_sample: Instant,
185    creation: Instant,
186    sampling_baseline: Instant,
187    samples: VecDeque<Sample>,
188    monitoring_enabled: bool,
189    shutting_down: bool,
190}
191
192type MonitoredComponentSender = Sender<(MonitoredComponentId, MonitoredComponentMsg)>;
193type MonitoredComponentReceiver = Receiver<(MonitoredComponentId, MonitoredComponentMsg)>;
194
195impl BackgroundHangMonitorWorker {
196    fn new(
197        constellation_chan: GenericSender<HangMonitorAlert>,
198        control_port: GenericReceiver<BackgroundHangMonitorControlMsg>,
199        port: MonitoredComponentReceiver,
200        monitoring_enabled: bool,
201    ) -> Self {
202        let control_port = control_port.route_preserving_errors();
203        Self {
204            component_names: Default::default(),
205            monitored_components: Default::default(),
206            constellation_chan,
207            port,
208            control_port: Some(control_port),
209            sampling_duration: None,
210            sampling_max_duration: None,
211            last_sample: Instant::now(),
212            sampling_baseline: Instant::now(),
213            creation: Instant::now(),
214            samples: Default::default(),
215            monitoring_enabled,
216            shutting_down: Default::default(),
217        }
218    }
219
220    fn finish_sampled_profile(&mut self) {
221        let mut bytes = vec![];
222        bytes.extend(
223            format!(
224                "{{ \"rate\": {}, \"start\": {}, \"data\": [\n",
225                self.sampling_duration.unwrap().as_millis(),
226                (self.sampling_baseline - self.creation).as_millis(),
227            )
228            .as_bytes(),
229        );
230
231        let mut first = true;
232        let to_resolve = self.samples.len();
233        for (i, Sample(id, instant, stack)) in self.samples.drain(..).enumerate() {
234            println!("Resolving {}/{}", i + 1, to_resolve);
235            let profile = stack.to_hangprofile();
236            let name = match self.component_names.get(&id) {
237                Some(ref s) => format!("\"{}\"", s),
238                None => "null".to_string(),
239            };
240            let json = format!(
241                "{}{{ \"name\": {}, \"event loop id\": \"{:?}\", \
242                 \"time\": {}, \"frames\": {} }}",
243                if !first { ",\n" } else { "" },
244                name,
245                id,
246                (instant - self.sampling_baseline).as_millis(),
247                serde_json::to_string(&profile.backtrace).unwrap(),
248            );
249            bytes.extend(json.as_bytes());
250            first = false;
251        }
252
253        bytes.extend(b"\n] }");
254        let _ = self
255            .constellation_chan
256            .send(HangMonitorAlert::Profile(bytes));
257    }
258
259    fn run(&mut self) -> bool {
260        let tick = if let Some(duration) = self.sampling_duration {
261            let duration = duration
262                .checked_sub(Instant::now() - self.last_sample)
263                .unwrap_or_else(|| Duration::from_millis(0));
264            after(duration)
265        } else if self.monitoring_enabled {
266            after(Duration::from_millis(100))
267        } else {
268            never()
269        };
270
271        // Helper enum to collect messages from different ports depending
272        // on the BHM state.
273        enum BhmMessage {
274            ComponentMessage((MonitoredComponentId, MonitoredComponentMsg)),
275            ToggleSampler(Duration, Duration),
276            Exit,
277            ControlError(String),
278            ControlDisconnected,
279            Tick,
280            PortDisconnected,
281        }
282
283        let result = if let Some(ref control_port) = self.control_port {
284            select! {
285                recv(self.port) -> event => {
286                    match event {
287                        Ok(msg) => BhmMessage::ComponentMessage(msg),
288                        Err(_) => BhmMessage::PortDisconnected,
289                    }
290                },
291                recv(control_port) -> event => {
292                    match event {
293                        Ok(Ok(BackgroundHangMonitorControlMsg::ToggleSampler(rate, max_duration))) => {
294                            BhmMessage::ToggleSampler(rate, max_duration)
295                        },
296                        Ok(Ok(BackgroundHangMonitorControlMsg::Exit)) => BhmMessage::Exit,
297                        Ok(Err(e)) => BhmMessage::ControlError(format!("{e:?}")),
298                        Err(_) => BhmMessage::ControlDisconnected,
299                    }
300                }
301                recv(tick) -> _ => BhmMessage::Tick,
302            }
303        } else {
304            // control_port is already disconnected, just wait on port and tick
305            select! {
306                recv(self.port) -> event => {
307                    match event {
308                        Ok(msg) => BhmMessage::ComponentMessage(msg),
309                        Err(_) => BhmMessage::PortDisconnected,
310                    }
311                },
312                recv(tick) -> _ => BhmMessage::Tick,
313            }
314        };
315
316        match result {
317            BhmMessage::PortDisconnected => {
318                // All senders have dropped,
319                // which means all monitored components have shut down,
320                // and so we can as well.
321                return false;
322            },
323            BhmMessage::ToggleSampler(rate, max_duration) => {
324                if self.sampling_duration.is_some() {
325                    println!("Enabling profiler.");
326                    self.finish_sampled_profile();
327                    self.sampling_duration = None;
328                } else {
329                    println!("Disabling profiler.");
330                    self.sampling_duration = Some(rate);
331                    self.sampling_max_duration = Some(max_duration);
332                    self.sampling_baseline = Instant::now();
333                }
334            },
335            BhmMessage::Exit => {
336                for component in self.monitored_components.values_mut() {
337                    component.exit_signal.signal_to_exit();
338                }
339
340                // Note the start of shutdown to ensure exit propagates,
341                // even to components that have yet to register themselves,
342                // from this point on.
343                self.shutting_down = true;
344            },
345            BhmMessage::ControlError(e) => {
346                warn!("BackgroundHangMonitorWorker control message deserialization error: {e}");
347            },
348            BhmMessage::ControlDisconnected => {
349                // The control port has disconnected. This can happen during
350                // shutdown when the EventLoop drops before all script threads
351                // have finished. Instead of exiting immediately, we signal
352                // all components to exit and continue running until they all
353                // unregister (indicated by `self.port` disconnecting).
354                if !self.shutting_down {
355                    error!("BHM control disconnected before shutting down!");
356                    for component in self.monitored_components.values_mut() {
357                        component.exit_signal.signal_to_exit();
358                    }
359                    self.shutting_down = true;
360                }
361                self.control_port = None;
362            },
363            BhmMessage::ComponentMessage(msg) => {
364                self.handle_msg(msg);
365                while let Ok(another_msg) = self.port.try_recv() {
366                    // Handle any other incoming messages before performing a hang checkpoint.
367                    self.handle_msg(another_msg);
368                }
369            },
370            BhmMessage::Tick => {
371                // Just proceed to checkpoint
372            },
373        }
374
375        if let Some(duration) = self.sampling_duration {
376            let now = Instant::now();
377            if now - self.last_sample > duration {
378                self.sample();
379                self.last_sample = now;
380            }
381        } else {
382            self.perform_a_hang_monitor_checkpoint();
383        }
384        true
385    }
386
387    fn handle_msg(&mut self, msg: (MonitoredComponentId, MonitoredComponentMsg)) {
388        match msg {
389            (
390                component_id,
391                MonitoredComponentMsg::Register(
392                    sampler,
393                    name,
394                    transient_hang_timeout,
395                    permanent_hang_timeout,
396                    exit_signal,
397                ),
398            ) => {
399                // If we are shutting down,
400                // propagate it to the component,
401                // and register it(the component will unregister itself
402                // as part of handling the exit).
403                if self.shutting_down {
404                    exit_signal.signal_to_exit();
405                }
406
407                let component = MonitoredComponent {
408                    sampler,
409                    last_activity: Instant::now(),
410                    last_annotation: None,
411                    transient_hang_timeout,
412                    permanent_hang_timeout,
413                    sent_transient_alert: false,
414                    sent_permanent_alert: false,
415                    is_waiting: true,
416                    exit_signal,
417                };
418                if let Some(name) = name {
419                    self.component_names.insert(component_id.clone(), name);
420                }
421                assert!(
422                    self.monitored_components
423                        .insert(component_id, component)
424                        .is_none(),
425                    "This component was already registered for monitoring."
426                );
427            },
428            (component_id, MonitoredComponentMsg::Unregister) => {
429                self.monitored_components
430                    .remove_entry(&component_id)
431                    .expect("Received Unregister for an unknown component");
432            },
433            (component_id, MonitoredComponentMsg::NotifyActivity(annotation)) => {
434                let component = self
435                    .monitored_components
436                    .get_mut(&component_id)
437                    .expect("Received NotifyActivity for an unknown component");
438                component.last_activity = Instant::now();
439                component.last_annotation = Some(annotation);
440                component.sent_transient_alert = false;
441                component.sent_permanent_alert = false;
442                component.is_waiting = false;
443            },
444            (component_id, MonitoredComponentMsg::NotifyWait) => {
445                let component = self
446                    .monitored_components
447                    .get_mut(&component_id)
448                    .expect("Received NotifyWait for an unknown component");
449                component.last_activity = Instant::now();
450                component.sent_transient_alert = false;
451                component.sent_permanent_alert = false;
452                component.is_waiting = true;
453            },
454        }
455    }
456
457    fn perform_a_hang_monitor_checkpoint(&mut self) {
458        for (component_id, monitored) in self.monitored_components.iter_mut() {
459            if monitored.is_waiting {
460                continue;
461            }
462            let last_annotation = monitored.last_annotation.unwrap();
463            if monitored.last_activity.elapsed() > monitored.permanent_hang_timeout {
464                if monitored.sent_permanent_alert {
465                    continue;
466                }
467                let profile = match monitored.sampler.suspend_and_sample_thread() {
468                    Ok(native_stack) => Some(native_stack.to_hangprofile()),
469                    Err(()) => None,
470                };
471                let _ = self
472                    .constellation_chan
473                    .send(HangMonitorAlert::Hang(HangAlert::Permanent(
474                        component_id.clone(),
475                        last_annotation,
476                        profile,
477                    )));
478                monitored.sent_permanent_alert = true;
479                continue;
480            }
481            if monitored.last_activity.elapsed() > monitored.transient_hang_timeout {
482                if monitored.sent_transient_alert {
483                    continue;
484                }
485                let _ = self
486                    .constellation_chan
487                    .send(HangMonitorAlert::Hang(HangAlert::Transient(
488                        component_id.clone(),
489                        last_annotation,
490                    )));
491                monitored.sent_transient_alert = true;
492            }
493        }
494    }
495
496    fn sample(&mut self) {
497        for (component_id, monitored) in self.monitored_components.iter_mut() {
498            let instant = Instant::now();
499            if let Ok(stack) = monitored.sampler.suspend_and_sample_thread() {
500                if self.sampling_baseline.elapsed() >
501                    self.sampling_max_duration
502                        .expect("Max duration has been set")
503                {
504                    // Buffer is full, start discarding older samples.
505                    self.samples.pop_front();
506                }
507                self.samples
508                    .push_back(Sample(component_id.clone(), instant, stack));
509            }
510        }
511    }
512}