// tor_netdoc/parse/tokenize.rs
//! Break a string into a set of directory-object Items.
//!
//! This module defines Item, which represents a basic entry in a
//! directory document, and NetDocReader, which is used to break a
//! string into Items.

use crate::parse::keyword::Keyword;
use crate::types::misc::FromBytes;
use crate::util::PeekableIterator;
use crate::{Error, NetdocErrorKind as EK, Pos, Result};
use base64ct::{Base64, Encoding};
use itertools::Itertools;
use std::cell::{Ref, RefCell};
use std::iter::Peekable;
use std::str::FromStr;
use tor_error::internal;

/// Useful constants for netdoc object syntax
pub(crate) mod object {
    /// Marker that opens a BEGIN tag line ("-----BEGIN ").
    pub(crate) const BEGIN_STR: &str = "-----BEGIN ";
    /// Marker that opens an END tag line ("-----END ").
    pub(crate) const END_STR: &str = "-----END ";
    /// Trailing dashes that close a begin or end tag line.
    pub(crate) const TAG_END: &str = "-----";
    /// Maximum PEM base64 line length (not enforced during parsing)
    pub(crate) const BASE64_PEM_MAX_LINE: usize = 64;
}
29
/// Return true iff a given character is "space" according to the rules
/// of dir-spec.txt
///
/// Only a literal space or horizontal tab counts; newlines are line
/// terminators, not argument separators.
pub(crate) fn is_sp(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
35/// Check that all the characters in `s` are valid base64.
36///
37/// This is not a perfect check for base64ness -- it is mainly meant
38/// to help us recover after unterminated base64.
39fn b64check(s: &str) -> Result<()> {
40    for b in s.bytes() {
41        match b {
42            b'=' => (),
43            b'a'..=b'z' => (),
44            b'A'..=b'Z' => (),
45            b'0'..=b'9' => (),
46            b'/' | b'+' => (),
47            _ => {
48                return Err(EK::BadObjectBase64.at_pos(Pos::at(s)));
49            }
50        };
51    }
52    Ok(())
53}
54
/// A tagged object that is part of a directory Item.
///
/// This represents a single blob within a pair of "-----BEGIN
/// FOO-----" and "-----END FOO-----".  The data is not guaranteed to
/// be actual base64 when this object is created: doing so would
/// require either that we parse the base64 twice, or that we allocate
/// a buffer to hold the data before it's needed.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Object<'a> {
    /// Reference to the "tag" string (the 'foo') in the BEGIN line.
    tag: &'a str,
    /// Reference to the allegedly base64-encoded data.  This may or
    /// may not actually be base64 at this point.  Includes the
    /// newlines between base64 lines, but not the BEGIN/END lines.
    data: &'a str,
    /// Reference to the END line for this object.  This doesn't
    /// need to be parsed, but it's used to find where this object
    /// ends.
    endline: &'a str,
}
74
/// A single part of a directory object.
///
/// Each Item -- called an "entry" in dir-spec.txt -- has a keyword, a
/// (possibly empty) set of arguments, and an optional object.
///
/// This is a zero-copy implementation that points to slices within a
/// containing string.
#[derive(Clone, Debug)]
pub(crate) struct Item<'a, K: Keyword> {
    /// The keyword that determines the type of this item.
    kwd: K,
    /// A reference to the actual string that defines the keyword for
    /// this item.
    kwd_str: &'a str,
    /// Reference to the arguments that appear in the same line after the
    /// keyword.  Does not include the terminating newline or the
    /// space that separates the keyword for its arguments.
    args: &'a str,
    /// The arguments, split by whitespace.  This vector is constructed
    /// as needed, using interior mutability (filled in lazily by
    /// `args_as_vec`).
    split_args: RefCell<Option<Vec<&'a str>>>,
    /// If present, a base-64-encoded object that appeared at the end
    /// of this item.
    object: Option<Object<'a>>,
}
100
/// A cursor into a string that returns Items one by one.
///
/// (This type isn't used directly, but is returned wrapped in a Peekable.)
#[derive(Debug)]
struct NetDocReaderBase<'a, K: Keyword> {
    /// The string we're parsing.
    s: &'a str,
    /// Our position within the string, as a byte offset into `s`.
    off: usize,
    /// Tells Rust it's okay that we are parameterizing on K.
    _k: std::marker::PhantomData<K>,
}
113
impl<'a, K: Keyword> NetDocReaderBase<'a, K> {
    /// Create a new NetDocReader to split a string into tokens.
    ///
    /// Fails if `s` violates the netdoc UTF-8 rules (a leading BOM,
    /// or any embedded NUL byte).
    fn new(s: &'a str) -> Result<Self> {
        Ok(NetDocReaderBase {
            s: validate_utf_8_rules(s)?,
            off: 0,
            _k: std::marker::PhantomData,
        })
    }
    /// Return the current Pos within the string.
    fn pos(&self, pos: usize) -> Pos {
        Pos::from_offset(self.s, pos)
    }
    /// Skip forward by n bytes.
    ///
    /// (Note that standard caveats with byte-oriented processing of
    /// UTF-8 strings apply.)
    fn advance(&mut self, n: usize) -> Result<()> {
        if n > self.remaining() {
            // This is an internal invariant violation, not a problem
            // with the document being parsed.
            return Err(
                Error::from(internal!("tried to advance past end of document"))
                    .at_pos(Pos::from_offset(self.s, self.off)),
            );
        }
        self.off += n;
        Ok(())
    }
    /// Return the remaining number of bytes in this reader.
    fn remaining(&self) -> usize {
        self.s.len() - self.off
    }

    /// Return true if the next characters in this reader are `s`
    fn starts_with(&self, s: &str) -> bool {
        self.s[self.off..].starts_with(s)
    }
    /// Try to extract a NL-terminated line from this reader.  Always
    /// remove data if the reader is nonempty.
    ///
    /// The returned line does not include the newline.  If no newline
    /// is found, the rest of the string is drained and an error is
    /// returned, so that a caller loop still makes progress.
    fn line(&mut self) -> Result<&'a str> {
        let remainder = &self.s[self.off..];
        if let Some(nl_pos) = remainder.find('\n') {
            self.advance(nl_pos + 1)?;
            let line = &remainder[..nl_pos];

            // TODO: we should probably detect \r and do something about it.
            // Just ignoring it isn't the right answer, though.
            Ok(line)
        } else {
            self.advance(remainder.len())?; // drain everything.
            Err(EK::TruncatedLine.at_pos(self.pos(self.s.len())))
        }
    }

    /// Try to extract a line that begins with a keyword from this reader.
    ///
    /// Returns a (kwd, args) tuple on success.  `args` is everything
    /// after the first space/tab, or a zero-length slice if there are
    /// no arguments.
    fn kwdline(&mut self) -> Result<(&'a str, &'a str)> {
        let pos = self.off;
        let line = self.line()?;
        if line.is_empty() {
            return Err(EK::EmptyLine.at_pos(self.pos(pos)));
        }
        // A leading "opt " prefix is stripped, and annotations
        // (@-keywords) are disallowed after it.  (Presumably this is
        // the legacy dir-spec "opt" prefix -- TODO confirm.)
        let (line, anno_ok) = if let Some(rem) = line.strip_prefix("opt ") {
            (rem, false)
        } else {
            (line, true)
        };
        let mut parts_iter = line.splitn(2, [' ', '\t']);
        let kwd = match parts_iter.next() {
            Some(k) => k,
            // This case seems like it can't happen: split always returns
            // something, apparently.
            None => return Err(EK::MissingKeyword.at_pos(self.pos(pos))),
        };
        if !keyword_ok(kwd, anno_ok) {
            return Err(EK::BadKeyword.at_pos(self.pos(pos)));
        }
        // TODO(nickm): dir-spec does not yet allow unicode in the arguments, but we're
        // assuming that proposal 285 is accepted.
        let args = match parts_iter.next() {
            Some(a) => a,
            // take a zero-length slice, so it will be within the string.
            None => &kwd[kwd.len()..],
        };
        Ok((kwd, args))
    }

    /// Try to extract an Object beginning wrapped within BEGIN/END tags.
    ///
    /// Returns Ok(Some(Object(...))) on success if an object is
    /// found, Ok(None) if no object is found, and Err only if a
    /// corrupt object is found.
    fn object(&mut self) -> Result<Option<Object<'a>>> {
        use object::*;

        let pos = self.off;
        if !self.starts_with(BEGIN_STR) {
            return Ok(None);
        }
        let line = self.line()?;
        if !line.ends_with(TAG_END) {
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
        }
        // The tag is whatever sits between "-----BEGIN " and "-----".
        let tag = &line[BEGIN_STR.len()..(line.len() - TAG_END.len())];
        if !tag_keywords_ok(tag) {
            return Err(EK::BadObjectBeginTag.at_pos(self.pos(pos)));
        }
        let datapos = self.off;
        let (endlinepos, endline) = loop {
            let p = self.off;
            let line = self.line()?;
            if line.starts_with(END_STR) {
                break (p, line);
            }
            // Exit if this line isn't plausible base64.  Otherwise,
            // an unterminated base64 block could potentially
            // "consume" all the rest of the string, which would stop
            // us from recovering.
            b64check(line).map_err(|e| e.within(self.s))?;
        };
        // `data` covers every base64 line (newlines included) but not
        // the BEGIN or END lines themselves.
        let data = &self.s[datapos..endlinepos];
        if !endline.ends_with(TAG_END) {
            return Err(EK::BadObjectEndTag.at_pos(self.pos(endlinepos)));
        }
        let endtag = &endline[END_STR.len()..(endline.len() - TAG_END.len())];
        if endtag != tag {
            return Err(EK::BadObjectMismatchedTag.at_pos(self.pos(endlinepos)));
        }
        Ok(Some(Object { tag, data, endline }))
    }

    /// Read the next Item from this NetDocReaderBase.
    ///
    /// If successful, returns Ok(Some(Item)), or Ok(None) if exhausted.
    /// Returns Err on failure.
    ///
    /// Always consumes at least one line if possible; always ends on a
    /// line boundary if one exists.
    fn item(&mut self) -> Result<Option<Item<'a, K>>> {
        if self.remaining() == 0 {
            return Ok(None);
        }
        let (kwd_str, args) = self.kwdline()?;
        let object = self.object()?;
        // The split-args cache is filled in lazily, on first use.
        let split_args = RefCell::new(None);
        let kwd = K::from_str(kwd_str);
        Ok(Some(Item {
            kwd,
            kwd_str,
            args,
            split_args,
            object,
        }))
    }
}
269
/// Return true iff 's' is a valid keyword or annotation.
///
/// (Only allow annotations if `anno_ok` is true.)
fn keyword_ok(s: &str, anno_ok: bool) -> bool {
    /// Helper: return true if this character can appear in keywords.
    fn kwd_char_ok(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '-'
    }

    if s.is_empty() {
        return false;
    }
    // When annotations are allowed, a single leading '@' is skipped
    // before validating the rest.
    let body = if anno_ok {
        s.strip_prefix('@').unwrap_or(s)
    } else {
        s
    };
    // A keyword may not begin with '-'; every remaining character must
    // be alphanumeric or '-'.
    !body.starts_with('-') && body.chars().all(kwd_char_ok)
}
290
291/// Return true iff 's' is a valid keywords string for a BEGIN/END tag.
292pub(crate) fn tag_keywords_ok(s: &str) -> bool {
293    s.split(' ').all(|w| keyword_ok(w, false))
294}
295
296/// When used as an Iterator, returns a sequence of `Result<Item>`.
297impl<'a, K: Keyword> Iterator for NetDocReaderBase<'a, K> {
298    type Item = Result<Item<'a, K>>;
299    fn next(&mut self) -> Option<Self::Item> {
300        self.item().transpose()
301    }
302}
303
304/// Helper: as base64::decode(), but allows newlines in the middle of the
305/// encoded object.
306pub(crate) fn base64_decode_multiline(s: &str) -> std::result::Result<Vec<u8>, base64ct::Error> {
307    // base64 module hates whitespace.
308    let mut s = s.to_string();
309    s.retain(|ch| ch != '\n');
310    let v = Base64::decode_vec(&s)?;
311    Ok(v)
312}
313
impl<'a, K: Keyword> Item<'a, K> {
    /// Return the parsed keyword part of this item.
    pub(crate) fn kwd(&self) -> K {
        self.kwd
    }
    /// Return the keyword part of this item, as a string.
    pub(crate) fn kwd_str(&self) -> &'a str {
        self.kwd_str
    }
    /// Return true if the keyword for this item is in 'ks'.
    pub(crate) fn has_kwd_in(&self, ks: &[K]) -> bool {
        ks.contains(&self.kwd)
    }
    /// Return the arguments of this item, as a single string.
    pub(crate) fn args_as_str(&self) -> &'a str {
        self.args
    }
    /// Return the arguments of this item as a vector.
    fn args_as_vec(&self) -> Ref<'_, Vec<&'a str>> {
        // We're using an interior mutability pattern here to lazily
        // construct the vector.
        if self.split_args.borrow().is_none() {
            self.split_args.replace(Some(self.args().collect()));
        }
        Ref::map(self.split_args.borrow(), |opt| match opt {
            Some(v) => v,
            // Unreachable: the cache was just populated above.
            None => panic!(),
        })
    }
    /// Return an iterator over the arguments of this item.
    ///
    /// Arguments are split on space/tab; empty pieces (from runs of
    /// whitespace) are skipped.
    pub(crate) fn args(&self) -> impl Iterator<Item = &'a str> + use<'a, K> {
        self.args.split(is_sp).filter(|s| !s.is_empty())
    }
    /// Return the nth argument of this item, if there is one.
    pub(crate) fn arg(&self, idx: usize) -> Option<&'a str> {
        self.args_as_vec().get(idx).copied()
    }
    /// Return the nth argument of this item, or an error if it isn't there.
    pub(crate) fn required_arg(&self, idx: usize) -> Result<&'a str> {
        self.arg(idx)
            .ok_or_else(|| EK::MissingArgument.at_pos(Pos::at(self.args)))
    }
    /// Try to parse the nth argument (if it exists) into some type
    /// that supports FromStr.
    ///
    /// Returns Ok(None) if the argument doesn't exist.
    pub(crate) fn parse_optional_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
    where
        Error: From<V::Err>,
    {
        match self.arg(idx) {
            None => Ok(None),
            Some(s) => match s.parse() {
                Ok(r) => Ok(Some(r)),
                Err(e) => {
                    let e: Error = e.into();
                    // Attach the argument's position unless the error
                    // already carries one.
                    Err(e.or_at_pos(Pos::at(s)))
                }
            },
        }
    }
    /// Try to parse the nth argument (if it exists) into some type
    /// that supports FromStr.
    ///
    /// Return an error if the argument doesn't exist.
    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<V>
    where
        Error: From<V::Err>,
    {
        match self.parse_optional_arg(idx) {
            Ok(Some(v)) => Ok(v),
            Ok(None) => Err(EK::MissingArgument.at_pos(self.arg_pos(idx))),
            Err(e) => Err(e),
        }
    }
    /// Return the number of arguments for this Item
    pub(crate) fn n_args(&self) -> usize {
        self.args().count()
    }
    /// Return true iff this Item has an associated object.
    pub(crate) fn has_obj(&self) -> bool {
        self.object.is_some()
    }
    /// Return the tag of this item's associated object, if it has one.
    pub(crate) fn obj_tag(&self) -> Option<&'a str> {
        self.object.map(|o| o.tag)
    }
    /// Try to decode the base64 contents of this Item's associated object.
    ///
    /// On success, return the object's tag and decoded contents.
    /// Returns Ok(None) if there is no object at all.
    pub(crate) fn obj_raw(&self) -> Result<Option<(&'a str, Vec<u8>)>> {
        match self.object {
            None => Ok(None),
            Some(obj) => {
                let decoded = base64_decode_multiline(obj.data)
                    .map_err(|_| EK::BadObjectBase64.at_pos(Pos::at(obj.data)))?;
                Ok(Some((obj.tag, decoded)))
            }
        }
    }
    /// Try to decode the base64 contents of this Item's associated object,
    /// and make sure that its tag matches 'want_tag'.
    pub(crate) fn obj(&self, want_tag: &str) -> Result<Vec<u8>> {
        match self.obj_raw()? {
            None => Err(EK::MissingObject
                .with_msg(self.kwd.to_str())
                .at_pos(self.end_pos())),
            Some((tag, decoded)) => {
                if tag != want_tag {
                    Err(EK::WrongObject.at_pos(Pos::at(tag)))
                } else {
                    Ok(decoded)
                }
            }
        }
    }
    /// Try to decode the base64 contents of this item's associated object
    /// as a given type that implements FromBytes.
    pub(crate) fn parse_obj<V: FromBytes>(&self, want_tag: &str) -> Result<V> {
        let bytes = self.obj(want_tag)?;
        // Unwrap may be safe because above `.obj()` should return an Error if
        // wanted tag was not present
        #[allow(clippy::unwrap_used)]
        let p = Pos::at(self.object.unwrap().data);
        V::from_vec(bytes, p).map_err(|e| e.at_pos(p))
    }
    /// Return the position of this item.
    ///
    /// This position won't be useful unless it is later contextualized
    /// with the containing string.
    pub(crate) fn pos(&self) -> Pos {
        Pos::at(self.kwd_str)
    }
    /// Return the position of this Item in a string.
    ///
    /// Returns None if this item doesn't actually belong to the string.
    pub(crate) fn offset_in(&self, s: &str) -> Option<usize> {
        crate::util::str::str_offset(s, self.kwd_str)
    }
    /// Return the position of the n'th argument of this item.
    ///
    /// If this item does not have a n'th argument, return the
    /// position of the end of the final argument.
    pub(crate) fn arg_pos(&self, n: usize) -> Pos {
        let args = self.args_as_vec();
        if n < args.len() {
            Pos::at(args[n])
        } else {
            self.last_arg_end_pos()
        }
    }
    /// Return the position at the end of the last argument.  (This will
    /// point to a newline.)
    fn last_arg_end_pos(&self) -> Pos {
        Pos::at_end_of(self.args)
    }
    /// Return the position of the end of this object. (This will point to a
    /// newline.)
    pub(crate) fn end_pos(&self) -> Pos {
        match self.object {
            Some(o) => Pos::at_end_of(o.endline),
            None => self.last_arg_end_pos(),
        }
    }
    /// If this item occurs within s, return the byte offset
    /// immediately after the end of this item.
    pub(crate) fn offset_after(&self, s: &str) -> Option<usize> {
        self.end_pos().offset_within(s).map(|nl_pos| nl_pos + 1)
    }

    /// Return the text of this item, if it originated within `str`,
    /// from the start of its keyword up to and including its final newline.
    ///
    /// (Note the inclusive `start..=end` range: `end_pos` points at the
    /// newline, which is included in the returned slice.)
    #[allow(dead_code)] // unused when hsdesc not enabled.
    pub(crate) fn text_within<'b>(&self, s: &'b str) -> Option<&'b str> {
        let start = self.pos().offset_within(s)?;
        let end = self.end_pos().offset_within(s)?;
        s.get(start..=end)
    }
}
493
/// Represents an Item that might not be present, whose arguments we
/// want to inspect.  If the Item is there, this acts like a proxy to the
/// item; otherwise, it treats the item as having no arguments.
pub(crate) struct MaybeItem<'a, 'b, K: Keyword>(Option<&'a Item<'b, K>>);
498
499// All methods here are as for Item.
500impl<'a, 'b, K: Keyword> MaybeItem<'a, 'b, K> {
501    /// Return the position of this item, if it has one.
502    fn pos(&self) -> Pos {
503        match self.0 {
504            Some(item) => item.pos(),
505            None => Pos::None,
506        }
507    }
508    /// Construct a MaybeItem from an Option reference to an item.
509    pub(crate) fn from_option(opt: Option<&'a Item<'b, K>>) -> Self {
510        MaybeItem(opt)
511    }
512
513    /// If this item is present, parse its argument at position `idx`.
514    /// Treat the absence or malformedness of the argument as an error,
515    /// but treat the absence of this item as acceptable.
516    pub(crate) fn parse_arg<V: FromStr>(&self, idx: usize) -> Result<Option<V>>
517    where
518        Error: From<V::Err>,
519    {
520        match self.0 {
521            Some(item) => match item.parse_arg(idx) {
522                Ok(v) => Ok(Some(v)),
523                Err(e) => Err(e.or_at_pos(self.pos())),
524            },
525            None => Ok(None),
526        }
527    }
528    /// If this item is present, return its arguments as a single string.
529    pub(crate) fn args_as_str(&self) -> Option<&str> {
530        self.0.map(|item| item.args_as_str())
531    }
532    /// If this item is present, parse all of its arguments as a
533    /// single string.
534    pub(crate) fn parse_args_as_str<V: FromStr>(&self) -> Result<Option<V>>
535    where
536        Error: From<V::Err>,
537    {
538        match self.0 {
539            Some(item) => match item.args_as_str().parse::<V>() {
540                Ok(v) => Ok(Some(v)),
541                Err(e) => {
542                    let e: Error = e.into();
543                    Err(e.or_at_pos(self.pos()))
544                }
545            },
546            None => Ok(None),
547        }
548    }
549}
550
/// Extension trait for `Result<Item>` -- makes it convenient to implement
/// PauseAt predicates
pub(crate) trait ItemResult<K: Keyword> {
    /// Return true if this is an ok result with an annotation.
    fn is_ok_with_annotation(&self) -> bool;
    /// Return true if this is an ok result with a non-annotation.
    fn is_ok_with_non_annotation(&self) -> bool;
    /// Return true if this is an ok result with the keyword 'k'
    fn is_ok_with_kwd(&self, k: K) -> bool {
        // Provided method: defined in terms of the slice-based check.
        self.is_ok_with_kwd_in(&[k])
    }
    /// Return true if this is an ok result with a keyword in the slice 'ks'
    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool;
    /// Return true if this is an ok result with a keyword not in the slice 'ks'
    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool;
    /// Return true if this is an empty-line error.
    fn is_empty_line(&self) -> bool;
}
569
570impl<'a, K: Keyword> ItemResult<K> for Result<Item<'a, K>> {
571    fn is_ok_with_annotation(&self) -> bool {
572        match self {
573            Ok(item) => item.kwd().is_annotation(),
574            Err(_) => false,
575        }
576    }
577    fn is_ok_with_non_annotation(&self) -> bool {
578        match self {
579            Ok(item) => !item.kwd().is_annotation(),
580            Err(_) => false,
581        }
582    }
583    fn is_ok_with_kwd_in(&self, ks: &[K]) -> bool {
584        match self {
585            Ok(item) => item.has_kwd_in(ks),
586            Err(_) => false,
587        }
588    }
589    fn is_ok_with_kwd_not_in(&self, ks: &[K]) -> bool {
590        match self {
591            Ok(item) => !item.has_kwd_in(ks),
592            Err(_) => false,
593        }
594    }
595    fn is_empty_line(&self) -> bool {
596        matches!(
597            self,
598            Err(e) if e.netdoc_error_kind() == crate::err::NetdocErrorKind::EmptyLine
599        )
600    }
601}
602
/// A peekable cursor into a string that returns Items one by one.
///
/// This is an [`Iterator`], yielding [`Item`]s.
#[derive(Debug)]
pub(crate) struct NetDocReader<'a, K: Keyword> {
    // TODO: I wish there were some way around having this string
    // reference, since we already need one inside NetDocReaderBase.
    /// The underlying string being parsed.
    s: &'a str,
    /// A stream of tokens being parsed by this NetDocReader.
    /// (Peekable so that callers can inspect the next token without
    /// consuming it.)
    tokens: Peekable<NetDocReaderBase<'a, K>>,
}
615
impl<'a, K: Keyword> NetDocReader<'a, K> {
    /// Construct a new NetDocReader to read tokens from `s`.
    ///
    /// Fails if `s` violates the netdoc UTF-8 rules (see
    /// `validate_utf_8_rules`).
    pub(crate) fn new(s: &'a str) -> Result<Self> {
        Ok(NetDocReader {
            s,
            tokens: NetDocReaderBase::new(s)?.peekable(),
        })
    }
    /// Return a reference to the string used for this NetDocReader.
    pub(crate) fn str(&self) -> &'a str {
        self.s
    }
    /// Return a wrapper around the peekable iterator in this
    /// NetDocReader that reads tokens until it reaches an element where
    /// 'f' is true.
    pub(crate) fn pause_at<'f, 'r, F>(
        &mut self,
        mut f: F,
    ) -> itertools::PeekingTakeWhile<
        '_,
        Self,
        impl FnMut(&Result<Item<'a, K>>) -> bool + 'f + use<'a, 'f, F, K>,
    >
    where
        'f: 'r,
        F: FnMut(&Result<Item<'a, K>>) -> bool + 'f,
        K: 'f,
    {
        // Note the negation: we keep *taking* while `f` is false.
        self.peeking_take_while(move |i| !f(i))
    }

    /// Return true if there are no more items in this NetDocReader.
    // The implementation sadly needs to mutate the inner state, even if it's not *semantically*
    // mutated..  We don't want inner mutability just to placate clippy for an internal API.
    #[allow(clippy::wrong_self_convention)]
    #[allow(dead_code)] // TODO perhaps we should remove this ?
    pub(crate) fn is_exhausted(&mut self) -> bool {
        self.peek().is_none()
    }

    /// Give an error if there are remaining tokens in this NetDocReader.
    pub(crate) fn should_be_exhausted(&mut self) -> Result<()> {
        match self.peek() {
            None => Ok(()),
            Some(Ok(t)) => Err(EK::UnexpectedToken
                .with_msg(t.kwd().to_str())
                .at_pos(t.pos())),
            Some(Err(e)) => Err(e.clone()),
        }
    }

    /// Give an error if there are remaining tokens in this NetDocReader.
    ///
    /// Like [`should_be_exhausted`](Self::should_be_exhausted),
    /// but permit empty lines at the end of the document.
    pub(crate) fn should_be_exhausted_but_for_empty_lines(&mut self) -> Result<()> {
        use crate::err::NetdocErrorKind as K;
        // Skip (and discard) any trailing empty-line errors before the
        // final exhaustion check.
        while let Some(Err(e)) = self.peek() {
            if e.netdoc_error_kind() == K::EmptyLine {
                let _ignore = self.next();
            } else {
                break;
            }
        }
        self.should_be_exhausted()
    }

    /// Return the position from which the underlying reader is about to take
    /// the next token.  Use to make sure that the reader is progressing.
    pub(crate) fn pos(&mut self) -> Pos {
        match self.tokens.peek() {
            Some(Ok(tok)) => tok.pos(),
            Some(Err(e)) => e.pos(),
            None => Pos::at_end_of(self.s),
        }
    }
}
693
impl<'a, K: Keyword> Iterator for NetDocReader<'a, K> {
    type Item = Result<Item<'a, K>>;
    /// Yield the next Item (or error) from the underlying token stream.
    fn next(&mut self) -> Option<Self::Item> {
        self.tokens.next()
    }
}
700
impl<'a, K: Keyword> PeekableIterator for NetDocReader<'a, K> {
    /// Peek at the next Item (or error) without consuming it.
    fn peek(&mut self) -> Option<&Self::Item> {
        self.tokens.peek()
    }
}
706
707impl<'a, K: Keyword> itertools::PeekingNext for NetDocReader<'a, K> {
708    fn peeking_next<F>(&mut self, f: F) -> Option<Self::Item>
709    where
710        F: FnOnce(&Self::Item) -> bool,
711    {
712        if f(self.peek()?) { self.next() } else { None }
713    }
714}
715
716/// Check additional UTF-8 rules that the netdoc metaformat imposes on
717/// our documents.
718//
719// NOTE: We might decide in the future to loosen our rules here
720// for parsers that handle concatenated documents:
721// we might want to reject only those documents that contain NULs.
722// But with luck that will never be necessary.
723fn validate_utf_8_rules(s: &str) -> Result<&str> {
724    // No BOM, or mangled BOM, is allowed.
725    let first_char = s.chars().next();
726    if [Some('\u{feff}'), Some('\u{fffe}')].contains(&first_char) {
727        return Err(EK::BomMarkerFound.at_pos(Pos::at(s)));
728    }
729    // No NUL bytes are allowed.
730    if let Some(nul_pos) = memchr::memchr(0, s.as_bytes()) {
731        return Err(EK::NulFound.at_pos(Pos::from_byte(nul_pos)));
732    }
733    Ok(s)
734}
735
736#[cfg(test)]
737mod test {
738    // @@ begin test lint list maintained by maint/add_warning @@
739    #![allow(clippy::bool_assert_comparison)]
740    #![allow(clippy::clone_on_copy)]
741    #![allow(clippy::dbg_macro)]
742    #![allow(clippy::mixed_attributes_style)]
743    #![allow(clippy::print_stderr)]
744    #![allow(clippy::print_stdout)]
745    #![allow(clippy::single_char_pattern)]
746    #![allow(clippy::unwrap_used)]
747    #![allow(clippy::unchecked_time_subtraction)]
748    #![allow(clippy::useless_vec)]
749    #![allow(clippy::needless_pass_by_value)]
750    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->
751    #![allow(clippy::cognitive_complexity)]
752    use super::*;
753    use crate::parse::macros::test::Fruit;
754    use crate::{NetdocErrorKind as EK, Pos, Result};
755
    #[test]
    fn read_simple() {
        use Fruit::*;

        // A small well-formed document: one annotation, one "opt"
        // keyword, two bare keywords, one keyword with an object, and
        // one trailing keyword line.
        let s = "\
@tasty very much so
opt apple 77
banana 60
cherry 6
-----BEGIN CHERRY SYNOPSIS-----
8J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
-----END CHERRY SYNOPSIS-----
plum hello there
";
        let mut r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();

        assert_eq!(r.str(), s);
        assert!(r.should_be_exhausted().is_err()); // it's not exhausted.

        let toks: Result<Vec<_>> = r.by_ref().collect();
        assert!(r.should_be_exhausted().is_ok());

        let toks = toks.unwrap();
        assert_eq!(toks.len(), 5);
        // toks[0]: the "@tasty" annotation line.
        assert_eq!(toks[0].kwd(), ANN_TASTY);
        assert_eq!(toks[0].n_args(), 3);
        assert_eq!(toks[0].args_as_str(), "very much so");
        assert_eq!(toks[0].arg(1), Some("much"));
        {
            let a: Vec<_> = toks[0].args().collect();
            assert_eq!(a, vec!["very", "much", "so"]);
        }
        assert!(toks[0].parse_arg::<usize>(0).is_err());
        assert!(toks[0].parse_arg::<usize>(10).is_err());
        assert!(!toks[0].has_obj());
        assert_eq!(toks[0].obj_tag(), None);

        // toks[2]: "banana 60" -- position checks.
        assert_eq!(toks[2].pos().within(s), Pos::from_line(3, 1));
        assert_eq!(toks[2].arg_pos(0).within(s), Pos::from_line(3, 8));
        assert_eq!(toks[2].last_arg_end_pos().within(s), Pos::from_line(3, 10));
        assert_eq!(toks[2].end_pos().within(s), Pos::from_line(3, 10));

        // toks[3]: "cherry 6" plus its base64 object.
        assert_eq!(toks[3].kwd(), STONEFRUIT);
        assert_eq!(toks[3].kwd_str(), "cherry"); // not cherry/plum!
        assert_eq!(toks[3].n_args(), 1);
        assert_eq!(toks[3].required_arg(0), Ok("6"));
        assert_eq!(toks[3].parse_arg::<usize>(0), Ok(6));
        assert_eq!(toks[3].parse_optional_arg::<usize>(0), Ok(Some(6)));
        assert_eq!(toks[3].parse_optional_arg::<usize>(3), Ok(None));
        assert!(toks[3].has_obj());
        assert_eq!(toks[3].obj_tag(), Some("CHERRY SYNOPSIS"));
        assert_eq!(
            &toks[3].obj("CHERRY SYNOPSIS").unwrap()[..],
            "🍒🍒🍒🍒🍒🍒".as_bytes()
        );
        assert!(toks[3].obj("PLUOT SYNOPSIS").is_err());
        // this "end-pos" value is questionable!
        assert_eq!(toks[3].end_pos().within(s), Pos::from_line(7, 30));
    }
815
816    #[test]
817    fn test_badtoks() {
818        use Fruit::*;
819
820        let s = "\
821-foobar 9090
822apple 3.14159
823$hello
824unrecognized 127.0.0.1 foo
825plum
826-----BEGIN WHATEVER-----
8278J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
828-----END SOMETHING ELSE-----
829orange
830orange
831-----BEGIN WHATEVER-----
832not! base64!
833-----END WHATEVER-----
834guava paste
835opt @annotation
836orange
837-----BEGIN LOBSTER
8388J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
839-----END SOMETHING ELSE-----
840orange
841-----BEGIN !!!!!!-----
8428J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
843-----END !!!!!!-----
844cherry
845-----BEGIN CHERRY SYNOPSIS-----
8468J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
847-----END CHERRY SYNOPSIS
848
849truncated line";
850
851        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
852        let toks: Vec<_> = r.collect();
853
854        assert!(toks[0].is_err());
855        assert_eq!(
856            toks[0].as_ref().err().unwrap(),
857            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
858        );
859
860        assert!(toks[1].is_ok());
861        assert!(toks[1].is_ok_with_non_annotation());
862        assert!(!toks[1].is_ok_with_annotation());
863        assert!(toks[1].is_ok_with_kwd_in(&[APPLE, ORANGE]));
864        assert!(toks[1].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
865        let t = toks[1].as_ref().unwrap();
866        assert_eq!(t.kwd(), APPLE);
867        assert_eq!(t.arg(0), Some("3.14159"));
868
869        assert!(toks[2].is_err());
870        assert!(!toks[2].is_ok_with_non_annotation());
871        assert!(!toks[2].is_ok_with_annotation());
872        assert!(!toks[2].is_ok_with_kwd_in(&[APPLE, ORANGE]));
873        assert!(!toks[2].is_ok_with_kwd_not_in(&[ORANGE, UNRECOGNIZED]));
874        assert_eq!(
875            toks[2].as_ref().err().unwrap(),
876            &EK::BadKeyword.at_pos(Pos::from_line(3, 1))
877        );
878
879        assert!(toks[3].is_ok());
880        let t = toks[3].as_ref().unwrap();
881        assert_eq!(t.kwd(), UNRECOGNIZED);
882        assert_eq!(t.arg(1), Some("foo"));
883
884        assert!(toks[4].is_err());
885        assert_eq!(
886            toks[4].as_ref().err().unwrap(),
887            &EK::BadObjectMismatchedTag.at_pos(Pos::from_line(8, 1))
888        );
889
890        assert!(toks[5].is_ok());
891        let t = toks[5].as_ref().unwrap();
892        assert_eq!(t.kwd(), ORANGE);
893        assert_eq!(t.args_as_str(), "");
894
895        // This blob counts as two errors: a bad base64 blob, and
896        // then an end line.
897        assert!(toks[6].is_err());
898        assert_eq!(
899            toks[6].as_ref().err().unwrap(),
900            &EK::BadObjectBase64.at_pos(Pos::from_line(12, 1))
901        );
902
903        assert!(toks[7].is_err());
904        assert_eq!(
905            toks[7].as_ref().err().unwrap(),
906            &EK::BadKeyword.at_pos(Pos::from_line(13, 1))
907        );
908
909        assert!(toks[8].is_ok());
910        let t = toks[8].as_ref().unwrap();
911        assert_eq!(t.kwd(), GUAVA);
912
913        // this is an error because you can't use opt with annotations.
914        assert!(toks[9].is_err());
915        assert_eq!(
916            toks[9].as_ref().err().unwrap(),
917            &EK::BadKeyword.at_pos(Pos::from_line(15, 1))
918        );
919
920        // this looks like a few errors.
921        assert!(toks[10].is_err());
922        assert_eq!(
923            toks[10].as_ref().err().unwrap(),
924            &EK::BadObjectBeginTag.at_pos(Pos::from_line(17, 1))
925        );
926        assert!(toks[11].is_err());
927        assert_eq!(
928            toks[11].as_ref().err().unwrap(),
929            &EK::BadKeyword.at_pos(Pos::from_line(18, 1))
930        );
931        assert!(toks[12].is_err());
932        assert_eq!(
933            toks[12].as_ref().err().unwrap(),
934            &EK::BadKeyword.at_pos(Pos::from_line(19, 1))
935        );
936
937        // so does this.
938        assert!(toks[13].is_err());
939        assert_eq!(
940            toks[13].as_ref().err().unwrap(),
941            &EK::BadObjectBeginTag.at_pos(Pos::from_line(21, 1))
942        );
943        assert!(toks[14].is_err());
944        assert_eq!(
945            toks[14].as_ref().err().unwrap(),
946            &EK::BadKeyword.at_pos(Pos::from_line(22, 1))
947        );
948        assert!(toks[15].is_err());
949        assert_eq!(
950            toks[15].as_ref().err().unwrap(),
951            &EK::BadKeyword.at_pos(Pos::from_line(23, 1))
952        );
953
954        // not this.
955        assert!(toks[16].is_err());
956        assert_eq!(
957            toks[16].as_ref().err().unwrap(),
958            &EK::BadObjectEndTag.at_pos(Pos::from_line(27, 1))
959        );
960
961        assert!(toks[17].is_err());
962        assert_eq!(
963            toks[17].as_ref().err().unwrap(),
964            &EK::EmptyLine.at_pos(Pos::from_line(28, 1))
965        );
966
967        assert!(toks[18].is_err());
968        assert_eq!(
969            toks[18].as_ref().err().unwrap(),
970            &EK::TruncatedLine.at_pos(Pos::from_line(29, 15))
971        );
972    }
973
974    #[test]
975    fn test_leading_space_forbidden() {
976        // We need to make sure that items with a leading space aren't accepted:
977        // the spec forbids it, and it can provide a vector for inflating the size
978        // of downloaded hsdescs (see prop360).
979
980        // Try a simple item with a space at the front.
981        let s = "    guava space\n";
982        let r: NetDocReader<'_, Fruit> = NetDocReader::new(s).unwrap();
983        let toks: Vec<_> = r.collect();
984
985        // No space allowed at the start of a line.
986        assert_eq!(
987            toks[0].as_ref().err().unwrap(),
988            &EK::BadKeyword.at_pos(Pos::from_line(1, 1))
989        );
990
991        // Try an item with an object, inserting space at the start of each ine in turn.
992        let s = "cherry
993-----BEGIN WHATEVER-----
9948J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
995-----END WHATEVER-----
996";
997
998        let orig_lines = s
999            .split_terminator('\n')
1000            .map(str::to_string)
1001            .collect::<Vec<_>>();
1002        assert_eq!(orig_lines.len(), 4);
1003        let expected_kinds = [
1004            EK::BadKeyword,
1005            EK::BadKeyword,
1006            EK::BadObjectBase64,
1007            EK::BadObjectBase64,
1008        ];
1009        for pos in 0..orig_lines.len() {
1010            let mut lines = orig_lines.clone();
1011            lines[pos] = format!(" {}", lines[pos]);
1012            let joined = format!("{}\n", lines.join("\n"));
1013
1014            let r: NetDocReader<'_, Fruit> = NetDocReader::new(&joined).unwrap();
1015            let toks: Result<Vec<_>> = r.collect();
1016            assert_eq!(toks.unwrap_err().netdoc_error_kind(), expected_kinds[pos]);
1017        }
1018    }
1019
1020    #[test]
1021    fn test_validate_strings() {
1022        use validate_utf_8_rules as v;
1023        assert_eq!(v(""), Ok(""));
1024        assert_eq!(v("hello world"), Ok("hello world"));
1025        // We don't have to test a lot more valid cases, since this function is called before
1026        // parsing any string.
1027
1028        for s in ["\u{feff}", "\u{feff}hello world", "\u{fffe}hello world"] {
1029            let e = v(s).unwrap_err();
1030            assert_eq!(e.netdoc_error_kind(), EK::BomMarkerFound);
1031            assert_eq!(e.pos().offset_within(s), Some(0));
1032        }
1033
1034        for s in [
1035            "\0hello world",
1036            "\0",
1037            "\0\0\0",
1038            "hello\0world",
1039            "hello world\0",
1040        ] {
1041            let e = v(s).unwrap_err();
1042            assert_eq!(e.netdoc_error_kind(), EK::NulFound);
1043            let nul_pos = e.pos().offset_within(s).unwrap();
1044            assert_eq!(s.as_bytes()[nul_pos], 0);
1045        }
1046    }
1047
1048    fn single_fruit(s: &str) -> Item<'_, Fruit> {
1049        NetDocReader::<Fruit>::new(s)
1050            .unwrap()
1051            .next()
1052            .unwrap()
1053            .unwrap()
1054    }
1055
1056    #[test]
1057    fn end_of_item() {
1058        let s = "guava friends 123   \n";
1059        let item = single_fruit(s);
1060        assert_eq!(
1061            item.end_pos().within(s),
1062            Pos::from_byte(s.find('\n').unwrap()).within(s)
1063        );
1064
1065        let s = "cherry
1066-----BEGIN WHATEVER-----
10678J+NkvCfjZLwn42S8J+NkvCfjZLwn42S
1068-----END WHATEVER-----\n";
1069        let item = single_fruit(s);
1070        dbg!(&item);
1071        assert_eq!(
1072            item.end_pos().within(s),
1073            Pos::from_byte(s.rfind('\n').unwrap()).within(s)
1074        );
1075    }
1076}