Skip to main content

tor_log_ratelim/
macros.rs

1//! Macros to make it easier to work with rate-limited logs
2//!
3
4/// Log a rate-limited failure message.
5///
6/// This macro looks at a single `Result<T,E>`,
7/// and keeps track of how many times the `Ok` and `Err` branches are seen.
8/// After a delay, it reports via [`tracing::event!`]
9/// how many errors it has seen since its last report.
10/// (It only reports an absence of errors
11/// when such an absence has followed
12/// an error report.)
13///
14/// ## A simple example
15///
16/// ```
17/// # use std::num::ParseIntError;
18/// # let s = "1234";
19/// # let source = "the cache";
20/// use tor_log_ratelim::log_ratelim;
21///  let r: Result<u8, ParseIntError> = s.parse();
22///
23/// log_ratelim!(
24///   // The activity we were doing.
25///   "Parsing a value from {}", source;
26///   // The result we got.
27///   r;
28/// );
29/// ```
30///
31/// This invocation could report errors like
32/// ```text
33/// WARN: Parsing a value from the cache: error (occurred 9/12 times in the last 5 minutes): number too large to fit in target type
34/// ```
35///
36/// After a while without errors, it might log:
37/// ```text
38/// WARN: Parsing a value from the cache: now working (occurred 0/100 times in th last hour)
39/// ```
40///
41/// ## Important concept: Activities
42///
43/// Every invocation of `log_ratelim!` defines a _set_ of rate limits
44/// with respect to a collection of **activities**.  
45/// Each separate **activity** value gets its own rate limit.
46/// This lets you have separate rate limits for different operations,
47/// such as connecting to different parties,
48/// or invoking different programs.
49///
50/// Typical activities might be
51/// `"Connecting to port {}", p`
52/// or
53/// `"Trying to start program {}", p`
54///
55/// (These activities should be described using a verb ending with "-ing",
56/// to make the output nice.)
57///
58/// ## Requirements on the error type.
59///
60/// The error type for each `Result` passed to this macro must implement:
61///  * [`Clone`]
62///  * [`Send`]
63///  * [`Error`](std::error::Error)
64///
65/// ## Reports are representative
66///
67/// No matter how many failures are seen between log messages,
68/// `log_ratelim!` only records and reports
69/// one error for each time it logs.
70///
71/// Its current behavior is to record and report
72/// the _first_ error for each logged interval,
73/// and discard the others.  
74/// This can lead to confusing results if the error is not representative.
75///
76/// ## Advanced syntax
77///
78/// The `log_ratelim` macro can record additional information for its
79/// representative error report,
80/// and can log information on successes as well.
81///
82/// A full invocation of `log_ratelim!` looks like this:
83///
84/// ```
85/// # use std::num::ParseIntError;
86/// # let s = "1234";
87/// # let source = "the cache";
88/// # let more_information = |_| "";
89/// use tor_log_ratelim::log_ratelim;
90/// let r: Result<u8, ParseIntError> = s.parse();
91/// log_ratelim!(
92///   "Parsing a value from {}", source;
93///   r;
94///   Err(x) => WARN, "The problem was {}", more_information(x);
95///   Ok(v) => TRACE, "Parsed {} successfully", v;
96/// );
97/// ```
98///
99/// Here the clause starting with `Err(x)`
100/// tells the logger to include a message along with the error report,
101/// and we explicitly specifies the level at which
102/// to log our failure reports.
103///
104/// Note that the error itself is **always** reported;
105/// there is no need to say
106/// `Err(e) => WARN, "{}", e`.
107/// In fact, doing so will create a redundant report of
108/// the error.
109//
110// TODO: I don't think it makes sense to have an Ok() logger.
111// Instead, we could just say
112//    log_ratelim!("Parsing a value from {}", source; r;);
113//    trace!("parsed value from {}: {:?}", source, r);
114// This is probably better, since it logs a trace for every occurrence.
115//
116/// The clause starting with `Ok(v)` tells the logger what to do on success:
117/// each individual success causes a _non-rate-limited_
118/// message at TRACE level.
119///
120/// The `Ok() ...` clause
121/// and the `Err() ...` clause
122/// are both optional.
123///
124/// Within the Err() clause,
125/// the format string and its arguments
126/// are optional.
127//
128// TODO performance notes:
129//
130// There are many opportunities for possibly making this code go faster:
131//  - Optimize the case where there is only one activity.
132//  - Use a non-string key to distinguish activities, to avoid formatting
133//    the activity string needlessly.
134//  - Limit copies (of activity and of error).
135//  - Use Event and Metadata from the tracing crate to defer formatting
136//  - Check Metadata early for the case where we don't want to report the
137//    event at all.
138//
139// Let's not pursue any of these until we know that this code actually
140// shows up in a critical path.
141#[macro_export]
142macro_rules! log_ratelim {
143  // ====
144  // Actual implementation for rate-limited logging.
145  // ====
146
147  // Nobody invokes this syntax directly; it's used as a common body for the
148  // various other syntaxes.
149  {
150    @impl activity_format: ( $act_fmt:literal $(, $act_arg:expr)* ) ;
151          result: ($result:expr ) ;
152          on_error: (Err($err_pat:pat), $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? );
153          $( on_ok: (Ok($ok_pat:pat), $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr)*  ); )?
154  } => {
155    #[allow(clippy::redundant_closure_call)]
156    (||{
157    use $crate::macro_prelude::*;
158    let Some(runtime) = rt_support() else {
159      // Nobody has called `install_runtime()`: we should just log whatever
160      // happened and not worry about the rate-limiting.
161      match &$result {
162        #[allow(clippy::redundant_pattern)]
163        Err(the_error @ $err_pat) => {
164          tracing::event!(
165            tracing::Level::$err_level,
166            concat!($act_fmt, $(": ", $err_fmt, )? ": {}"),
167            $($act_arg,)*
168            $( $($err_arg, )* )?
169            the_error.report()
170          );
171        }
172        $(Ok($ok_pat) => {
173          tracing::event!(
174            tracing::Level::$ok_level,
175            $ok_fmt
176            $(, $ok_arg)*
177          );
178        })?
179        #[allow(unreachable_patterns)]
180        Ok(_) => {}
181      }
182      return;
183    };
184
185    /// An implementation of Loggable for this log message.
186    //
187    // We use a separate implementation here so that the tracing metadata will get
188    // constructed correctly.  If we called tracing::event! from a location in
189    // `tor-log-ratelim`, all the messages would appear to originate from there.
190    //
191    // (TODO: We could use tracing::Metadata explicitly, perhaps? That might be hard.)
192    struct Lg {
193        /// Internal state of the log.
194        state: LogState,
195        /// The activity status the last time we flushed.
196        last_activity: Option<Activity>,
197    }
198    impl Loggable for Lg {
199        fn flush(&mut self, summarizing: std::time::Duration) -> Activity {
200            let activity = self.state.activity();
201            match activity {
202               Activity::Active => {
203                  tracing::event!(
204                      tracing::Level::$err_level,
205                      "{}",
206                      self.state.display_problem(summarizing)
207                  );
208               }
209               Activity::AppearsResolved => {
210                   // Don't want to log an update if we're continuing to see successes and no
211                   // failures.
212                   if self.last_activity != Some(Activity::AppearsResolved) {
213                       tracing::event!(
214                           // Using err_level here is in some respects confusing:
215                           // if the _presence_ of the problem is (say) a WARN,
216                           // why should its newfound absence also be a WARN?
217                           //
218                           // We have had to decide which is worse:
219                           // that a user only watching WARNs
220                           // might not see a problem has gone away,
221                           // or that a non-problem would be reported
222                           // at an excessive severity.
223                           // We went with the latter.
224                           tracing::Level::$err_level,
225                           "{}",
226                           self.state.display_recovery(summarizing)
227                       );
228                   }
229               }
230               // There have been no new successes or failures, so there's no update to report.
231               Activity::Dormant => {}
232            }
233            self.state.reset();
234            self.last_activity = Some(activity);
235            activity
236        }
237    }
238
239    /// A lazy map from activity keys to weak RateLim handles.
240    //
241    // The strong reference for each RateLim is held by a task that flushes
242    // the logger as appropriate, and drops the strong reference once it's
243    // quiescent.
244    static LOGGERS: LazyLock<Mutex<WeakValueHashMap<String, Weak<RateLim<Lg>>>>> =
245    LazyLock::new(|| Mutex::new(WeakValueHashMap::new()));
246
247    // We assign a separate rate limit for each activity.
248    // For now, this is string-ly typed.
249    let activity = format!($act_fmt $(, $act_arg)*);
250    let key = activity.clone();
251
252    match &$result {
253      #[allow(clippy::redundant_pattern)]
254      Err(the_error @ $err_pat) => {
255        // The operation failed.
256        //
257        // 1) Create a rate-limited logger for this activity if one  did not
258        //    already exist.
259        let logger = LOGGERS
260          .lock()
261          .expect("poisoned lock")
262          .entry(key)
263          .or_insert_with(|| RateLim::new(Lg {
264              state: LogState::new(activity),
265              last_activity: None,
266          }));
267        // 2) Note failure in the activity with note_fail().
268        logger.event(runtime, |lg| lg.state.note_fail(||
269          // 2b) If this is the first time that this activity failed since the
270          //     last flush, record the formatted err_msg, and a Clone of the error.
271          (
272            $crate::log_ratelim!{@first_nonempty
273              { $( Some(format!($err_fmt $(, $err_arg)* )) )? }
274              { None }
275            },
276            Some(Box::new(the_error.clone()))
277          )
278        ));
279      }
280      Ok($crate::log_ratelim!{@first_nonempty { $($ok_pat)? } {_} }) => {
281        // The operation succeeded.
282        //
283        // 1) If this activity is tracked, call note_ok() on it.
284        if let Some(logger) = LOGGERS
285          .lock()
286          .expect("poisoned lock")
287          .get(&key) {
288            logger.nonevent(|lg| lg.state.note_ok());
289          }
290        // 2) If we have a per-success item to log, log it.
291        $(
292        tracing::event!(tracing::Level::$ok_level, $ok_fmt $(, $ok_arg )* );
293        )?
294      }
295    }
296  })()
297  };
298
299  // ======
300  // Exposed, documented syntax.
301  // ======
302
303  // Regular invocation with an Err(_) case.
304  {
305    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
306    $result:expr ;
307    Err($err_pat:pat) => $err_level:ident $(, $err_fmt:literal $(, $err_arg:expr)* )? $(,)?
308    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)?  )?
309    $(;)?
310  } => {
311    $crate::log_ratelim!{
312      @impl
313        activity_format: ( $act_fmt $(, $act_arg)* );
314        result: ($result);
315        on_error: (Err($err_pat), $err_level $(, $err_fmt $(, $err_arg)* )? );
316        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
317    }
318  };
319  // Regular invocation with no Err(_) case.
320  {
321    $act_fmt:literal $(, $act_arg:expr )* $(,)? ;
322    $result:expr
323    $(; Ok($ok_pat:pat) => $ok_level:ident, $ok_fmt:literal $(, $ok_arg:expr )* $(,)? )?
324    $(;)?
325  } => {
326    $crate::log_ratelim!{
327      @impl
328        activity_format: ( $act_fmt $(, $act_arg)* );
329        result: ($result);
330        on_error: (Err(_), WARN);
331        $( on_ok: ( Ok($ok_pat), $ok_level, $ok_fmt $(, $ok_arg)* ); )?
332    }
333  };
334
335  // Expand to the first of two bodies that has at least one token in it.
336  { @first_nonempty { $($a:tt)+ } { $($b:tt)* }} => { $($a)+ };
337  { @first_nonempty { } { $($b:tt)* } } => { $($b)+ };
338}
339
340#[cfg(test)]
341mod test_syntax {
342    #![allow(dead_code)]
343
344    #[derive(Clone, Debug, thiserror::Error)]
345    enum MyErr {
346        #[error("it didn't work")]
347        DidntWork,
348    }
349    impl MyErr {
350        fn badness(&self) -> u8 {
351            3
352        }
353    }
354
355    /// This doesn't actually run or test anything; it just makes sure that all
356    /// the different syntaxes work.
357    fn various_syntaxes(friend: &str, r: &Result<u32, MyErr>) {
358        log_ratelim!(
359          "saying hi to {}", friend;
360          r;
361        );
362
363        log_ratelim!(
364          "saying hi to {}", friend;
365          r;
366          Err(_) => WARN;
367        );
368
369        log_ratelim!(
370          "saying hi to {}", friend;
371          r;
372          Err(e) => WARN, "badness={}", e.badness();
373        );
374
375        log_ratelim!(
376          "saying hi to {}", friend;
377          r;
378          Ok(v) => TRACE, "nothing bad happened; v={}", v;
379        );
380
381        log_ratelim!(
382          "saying hi to {}", friend;
383          r;
384          Ok(v) => TRACE, "nothing bad happened; v={}", v;
385        );
386
387        log_ratelim!(
388          "saying hi to {}", friend;
389          r;
390          Err(e) => WARN, "badness={}", e.badness();
391          Ok(v) => TRACE, "nothing bad happened; v={}", v;
392        );
393    }
394}