1use alloc::borrow::Cow;
9use alloc::string::String;
10use alloc::vec::Vec;
11use core::{char, iter::Peekable};
12
13use crate::serialize::txt::errors::{LexerError, LexerResult};
14
/// A lexer (tokenizer) for DNS zone-file ("master file") text.
pub(crate) struct Lexer<'a> {
    /// Peekable character stream over the (possibly borrowed) input text.
    txt: Peekable<CowChars<'a>>,
    /// Current state of the tokenizer state machine.
    state: State,
}
20
21impl<'a> Lexer<'a> {
22 pub(crate) fn new(txt: impl Into<Cow<'a, str>>) -> Self {
24 Lexer {
25 txt: CowChars {
26 data: txt.into(),
27 offset: 0,
28 }
29 .peekable(),
30 state: State::StartLine,
31 }
32 }
33
34 pub(crate) fn next_token(&mut self) -> LexerResult<Option<Token>> {
36 let mut char_data_vec: Option<Vec<String>> = None;
37 let mut char_data: Option<String> = None;
38
39 for i in 0..4_096 {
40 assert!(i < 4095); let ch: Option<char> = self.peek();
45
46 match self.state {
53 State::StartLine => {
54 match ch {
55 Some('\r') | Some('\n') => {
56 self.state = State::EOL;
57 }
58 Some(ch) if ch.is_whitespace() => self.state = State::Blank,
60 Some(_) => self.state = State::RestOfLine,
61 None => {
62 self.state = State::EOF;
63 }
64 }
65 }
66 State::RestOfLine => {
67 match ch {
68 Some('@') => self.state = State::At,
69 Some('(') => {
70 self.txt.next();
71 char_data_vec = Some(Vec::new());
72 self.state = State::List;
73 }
74 Some(ch @ ')') => return Err(LexerError::IllegalCharacter(ch)),
75 Some('$') => {
76 self.txt.next();
77 char_data = Some(String::new());
78 self.state = State::Dollar;
79 }
80 Some('\r') | Some('\n') => {
81 self.state = State::EOL;
82 }
83 Some('"') => {
84 self.txt.next();
85 char_data = Some(String::new());
86 self.state = State::Quote;
87 }
88 Some(';') => self.state = State::Comment { is_list: false },
89 Some(ch) if ch.is_whitespace() => {
90 self.txt.next();
91 } Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
93 char_data = Some(String::new());
94 self.state = State::CharData { is_list: false };
95 }
96 Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
97 None => {
98 self.state = State::EOF;
99 }
100 }
101 }
102 State::Blank => {
103 self.txt.next();
105 self.state = State::RestOfLine;
106 return Ok(Some(Token::Blank));
107 }
108 State::Comment { is_list } => {
109 match ch {
110 Some('\r') | Some('\n') => {
111 self.state = if is_list { State::List } else { State::EOL };
112 } Some(_) => {
114 self.txt.next();
115 } None => {
117 self.state = State::EOF;
118 }
119 }
120 }
121 State::Quote => {
122 match ch {
123 Some('"') => {
125 self.state = State::RestOfLine;
126 self.txt.next();
127 return Ok(Some(Token::CharData(
128 char_data.take().unwrap_or_else(|| "".into()),
129 )));
130 }
131 Some('\\') => {
132 Self::push_to_str(&mut char_data, self.escape_seq()?)?;
133 }
134 Some(ch) => {
135 self.txt.next();
136 Self::push_to_str(&mut char_data, ch)?;
137 }
138 None => return Err(LexerError::UnclosedQuotedString),
139 }
140 }
141 State::Dollar => {
142 match ch {
143 Some(ch @ 'A'..='Z') => {
145 self.txt.next();
146 Self::push_to_str(&mut char_data, ch)?;
147 }
148 Some(_) | None => {
150 self.state = State::RestOfLine;
151 let dollar: String = char_data.take().ok_or({
152 LexerError::IllegalState(
153 "char_data \
154 is None",
155 )
156 })?;
157
158 return Ok(Some(match dollar.as_str() {
159 "INCLUDE" => Token::Include,
160 "ORIGIN" => Token::Origin,
161 "TTL" => Token::Ttl,
162 _ => {
163 return Err(LexerError::UnrecognizedDollar(
164 char_data.take().unwrap_or_else(|| "".into()),
165 ));
166 }
167 }));
168 }
169 }
170 }
171 State::List => match ch {
172 Some(';') => {
173 self.txt.next();
174 self.state = State::Comment { is_list: true }
175 }
176 Some(')') => {
177 self.txt.next();
178 self.state = State::RestOfLine;
179 return match char_data_vec.take() {
180 Some(v) => Ok(Some(Token::List(v))),
181 None => Err(LexerError::IllegalState("char_data_vec is None")),
182 };
183 }
184 Some(ch) if ch.is_whitespace() => {
185 self.txt.next();
186 }
187 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
188 char_data = Some(String::new());
189 self.state = State::CharData { is_list: true }
190 }
191 Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
192 None => return Err(LexerError::UnclosedList),
193 },
194 State::CharData { is_list } => {
195 match ch {
196 Some(ch @ ')') if !is_list => {
197 return Err(LexerError::IllegalCharacter(ch));
198 }
199 Some(ch) if ch.is_whitespace() || ch == ')' || ch == ';' => {
200 if is_list {
201 char_data_vec
202 .as_mut()
203 .ok_or(LexerError::IllegalState("char_data_vec is None"))
204 .and_then(|v| {
205 let char_data = char_data
206 .take()
207 .ok_or(LexerError::IllegalState("char_data is None"))?;
208
209 v.push(char_data);
210 Ok(())
211 })?;
212 self.state = State::List;
213 } else {
214 self.state = State::RestOfLine;
215 return match char_data.take() {
216 Some(s) => Ok(Some(Token::CharData(s))),
217 None => Err(LexerError::IllegalState("char_data is None")),
218 };
219 }
220 }
221 Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
224 self.txt.next();
225 Self::push_to_str(&mut char_data, ch)?;
226 }
227 Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
228 None => {
229 self.state = State::EOF;
230 return char_data
231 .take()
232 .ok_or(LexerError::IllegalState("char_data is None"))
233 .map(|s| Some(Token::CharData(s)));
234 }
235 }
236 }
237 State::At => {
238 self.txt.next();
239 self.state = State::RestOfLine;
240 return Ok(Some(Token::At));
241 }
242 State::EOL => match ch {
243 Some('\r') => {
244 self.txt.next();
245 }
246 Some('\n') => {
247 self.txt.next();
248 self.state = State::StartLine;
249 return Ok(Some(Token::EOL));
250 }
251 Some(ch) => return Err(LexerError::IllegalCharacter(ch)),
252 None => return Err(LexerError::EOF),
253 },
254 State::EOF => {
256 self.txt.next(); return Ok(None);
258 }
259 }
260 }
261
262 unreachable!("The above match statement should have found a terminal state");
263 }
264
265 fn push_to_str(collect: &mut Option<String>, ch: char) -> LexerResult<()> {
266 let Some(s) = collect.as_mut() else {
267 return Err(LexerError::IllegalState("collect is None"));
268 };
269
270 s.push(ch);
271 Ok(())
272 }
273
274 fn escape_seq(&mut self) -> LexerResult<char> {
275 self.txt.next(); let ch = self.peek().ok_or(LexerError::EOF)?;
278
279 if !ch.is_control() {
280 if ch.is_numeric() {
281 let d1: u32 = self
283 .txt
284 .next()
285 .ok_or(LexerError::EOF)
286 .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??; let d2: u32 = self
288 .txt
289 .next()
290 .ok_or(LexerError::EOF)
291 .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??; let d3: u32 = self
293 .txt
294 .next()
295 .ok_or(LexerError::EOF)
296 .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??; let val: u32 = (d1 << 16) + (d2 << 8) + d3;
299 let ch: char = char::from_u32(val).ok_or(LexerError::UnrecognizedOctet(val))?;
300
301 Ok(ch)
302 } else {
303 self.txt.next(); Ok(ch)
306 }
307 } else {
308 Err(LexerError::IllegalCharacter(ch))
309 }
310 }
311
312 fn peek(&mut self) -> Option<char> {
313 self.txt.peek().copied()
314 }
315}
316
/// A `char` iterator over a `Cow<str>` that tracks its own byte offset into
/// the underlying text.
struct CowChars<'a> {
    /// The input text, borrowed or owned.
    data: Cow<'a, str>,
    /// Byte offset of the next character to yield.
    offset: usize,
}
321
322impl Iterator for CowChars<'_> {
323 type Item = char;
324
325 fn next(&mut self) -> Option<char> {
326 let mut iter = self.data[self.offset..].char_indices();
327 let (_, ch) = iter.next()?; match iter.next() {
329 Some((idx, _)) => self.offset += idx,
330 None => self.offset = self.data.len(),
331 }
332
333 Some(ch)
334 }
335}
336
#[doc(hidden)]
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum State {
    /// Beginning of a line; nothing has been consumed yet.
    StartLine,
    /// Past the first character of a line.
    RestOfLine,
    /// Leading whitespace was seen at the start of a line.
    Blank,
    /// Inside a `( ... )` list.
    List,
    /// Accumulating unquoted character data; `is_list` records whether the
    /// data belongs to a surrounding `( ... )` list.
    CharData { is_list: bool },
    /// Inside a `;` comment; `is_list` records whether the comment appeared
    /// inside a `( ... )` list.
    Comment { is_list: bool },
    /// An `@` was seen.
    At,
    /// Inside a `"` quoted string.
    Quote,
    /// Reading the name of a `$` directive.
    Dollar,
    /// End of line reached.
    EOL,
    /// End of input reached.
    EOF,
}
353
/// Tokens emitted by the zone-file [`Lexer`].
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum Token {
    /// Whitespace at the beginning of a line.
    Blank,
    /// The entries of a parenthesized `( ... )` list.
    List(Vec<String>),
    /// A run of unquoted characters or the contents of a quoted string.
    CharData(String),
    /// An `@` character.
    At,
    /// The `$INCLUDE` directive.
    Include,
    /// The `$ORIGIN` directive.
    Origin,
    /// The `$TTL` directive.
    Ttl,
    /// End of line (`\n`, optionally preceded by `\r`).
    EOL,
}
374
#[cfg(test)]
mod lex_test {
    use alloc::string::ToString;

    use super::*;

    /// Helper: advances the lexer, asserting that tokenization succeeded.
    #[allow(clippy::uninlined_format_args)]
    fn next_token(lexer: &mut Lexer<'_>) -> Option<Token> {
        let result = lexer.next_token();
        assert!(result.is_ok(), "{:?}", result);
        result.unwrap()
    }

    /// Leading-whitespace (Blank), plain character data, and EOL handling.
    #[test]
    fn blank() {
        // A line starting with a space produces a Blank token first.
        let mut lexer = Lexer::new(" dead beef");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        // A line starting with data produces no Blank token.
        let mut lexer = Lexer::new("dead beef");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );

        // CRLF yields a single EOL; the next line's leading space is Blank.
        let mut lexer = Lexer::new("dead beef\r\n after");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );

        // Empty list plus a trailing comment, then an indented second line.
        let mut lexer = Lexer::new(
            "dead beef ();comment
         after",
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("dead".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("beef".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("after".to_string())
        );
    }

    /// Outside quotes, backslash escapes pass through verbatim.
    #[test]
    fn escape() {
        assert_eq!(
            Lexer::new("a\\Aa").next_token().unwrap().unwrap(),
            Token::CharData("a\\Aa".to_string())
        );
        assert_eq!(
            Lexer::new("a\\$").next_token().unwrap().unwrap(),
            Token::CharData("a\\$".to_string())
        );
        assert_eq!(
            Lexer::new("a\\077").next_token().unwrap().unwrap(),
            Token::CharData("a\\077".to_string())
        );
    }

    /// Inside quotes, escapes are interpreted and delimiters lose meaning.
    #[test]
    fn quoted_txt() {
        assert_eq!(
            Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(),
            Token::CharData("Quoted".to_string())
        );
        assert_eq!(
            Lexer::new("\";@$\"").next_token().unwrap().unwrap(),
            Token::CharData(";@$".to_string())
        );
        assert_eq!(
            Lexer::new("\"some \\A\"").next_token().unwrap().unwrap(),
            Token::CharData("some A".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\Aa\"").next_token().unwrap().unwrap(),
            Token::CharData("aAa".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\$\"").next_token().unwrap().unwrap(),
            Token::CharData("a$".to_string())
        );
        assert_eq!(
            Lexer::new("\"a\\077\"").next_token().unwrap().unwrap(),
            Token::CharData("a\u{707}".to_string())
        );

        // Escape sequences that run off the end of the input are errors.
        assert!(Lexer::new("\"a\\\"").next_token().is_err());
        assert!(Lexer::new("\"a\\0\"").next_token().is_err());
        assert!(Lexer::new("\"a\\07\"").next_token().is_err());

        // Quoted strings may span lines; newlines are preserved in the data.
        let mut lexer = Lexer::new("\"multi\nline\ntext\"");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\nline\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");

        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("multi\r\nline\r\ntext".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer), None);

        // A quote that is never closed is an error.
        assert!(Lexer::new("\"multi").next_token().is_err());
    }

    /// Non-ASCII characters are accepted as ordinary character data.
    #[test]
    fn unicode() {
        assert_eq!(
            Lexer::new("♥").next_token().unwrap().unwrap(),
            Token::CharData("♥".to_string())
        );
    }

    /// Single-token smoke tests for every token kind.
    #[test]
    fn lex() {
        assert_eq!(
            next_token(&mut Lexer::new(".")).unwrap(),
            Token::CharData(".".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new(" .")).unwrap(),
            Token::Blank
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc")).unwrap(),
            Token::CharData("abc".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new("abc.")).unwrap(),
            Token::CharData("abc.".to_string())
        );
        // Comments produce no tokens at all.
        assert_eq!(next_token(&mut Lexer::new(";abc")), None);
        assert_eq!(next_token(&mut Lexer::new(";;@$-\"")), None);
        assert_eq!(next_token(&mut Lexer::new("@")).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut Lexer::new("123")).unwrap(),
            Token::CharData("123".to_string())
        );
        assert_eq!(
            next_token(&mut Lexer::new("$INCLUDE")).unwrap(),
            Token::Include
        );
        assert_eq!(
            next_token(&mut Lexer::new("$ORIGIN")).unwrap(),
            Token::Origin
        );
        assert_eq!(next_token(&mut Lexer::new("$TTL")).unwrap(), Token::Ttl);
        assert_eq!(next_token(&mut Lexer::new("\n")), Some(Token::EOL));
        assert_eq!(next_token(&mut Lexer::new("\r\n")), Some(Token::EOL));
    }

    /// Parenthesized lists, including comments and newlines inside them.
    #[test]
    fn list() {
        // An unclosed list is an error at EOF.
        let mut lexer = Lexer::new("(");
        assert!(lexer.next_token().is_err());

        // A bare ')' outside a list is illegal.
        assert!(Lexer::new(")").next_token().is_err());

        let mut lexer = Lexer::new("()");
        assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(abc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc\nabc)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string(), "abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);

        let mut lexer = Lexer::new("(\nabc;comment\n)");
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec!["abc".to_string()])
        );
        assert_eq!(next_token(&mut lexer), None);
    }

    /// End-to-end tokenization of an RFC 1035-style SOA zone snippet.
    #[test]
    #[allow(clippy::cognitive_complexity)]
    fn soa() {
        let mut lexer = Lexer::new(
            "@ IN SOA VENERA Action\\.domains (
                      20     ; SERIAL
                      7200   ; REFRESH
                      600    ; RETRY
                      3600000; EXPIRE
                      60)    ; MINIMUM

      NS      A.ISI.EDU.
      NS      VENERA
      NS      VAXA
      MX      10      VENERA
      MX      20      VAXA

A     A       26.3.0.103

VENERA  A       10.1.0.52
        A       128.9.0.32

$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT",
        );

        assert_eq!(next_token(&mut lexer).unwrap(), Token::At);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("IN".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("SOA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("Action\\.domains".to_string())
        );
        // The whole parenthesized group is one List token; the per-entry
        // comments inside it are discarded.
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::List(vec![
                "20".to_string(),
                "7200".to_string(),
                "600".to_string(),
                "3600000".to_string(),
                "60".to_string(),
            ])
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A.ISI.EDU.".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("NS".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("MX".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("20".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VAXA".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("26.3.0.103".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("VENERA".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("10.1.0.52".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("A".to_string())
        );
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("128.9.0.32".to_string())
        );
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
        assert_eq!(next_token(&mut lexer).unwrap(), Token::Include);
        assert_eq!(
            next_token(&mut lexer).unwrap(),
            Token::CharData("<SUBSYS>ISI-MAILBOXES.TXT".to_string())
        );
        assert!(next_token(&mut lexer).is_none());
    }
}