Skip to main content

tor_geoip/
lib.rs

1//! A crate for performing GeoIP lookups using the Tor GeoIP database.
2
3// @@ begin lint list maintained by maint/add_warning @@
4#![allow(renamed_and_removed_lints)] // @@REMOVE_WHEN(ci_arti_stable)
5#![allow(unknown_lints)] // @@REMOVE_WHEN(ci_arti_nightly)
6#![warn(missing_docs)]
7#![warn(noop_method_call)]
8#![warn(unreachable_pub)]
9#![warn(clippy::all)]
10#![deny(clippy::await_holding_lock)]
11#![deny(clippy::cargo_common_metadata)]
12#![deny(clippy::cast_lossless)]
13#![deny(clippy::checked_conversions)]
14#![warn(clippy::cognitive_complexity)]
15#![deny(clippy::debug_assert_with_mut_call)]
16#![deny(clippy::exhaustive_enums)]
17#![deny(clippy::exhaustive_structs)]
18#![deny(clippy::expl_impl_clone_on_copy)]
19#![deny(clippy::fallible_impl_from)]
20#![deny(clippy::implicit_clone)]
21#![deny(clippy::large_stack_arrays)]
22#![warn(clippy::manual_ok_or)]
23#![deny(clippy::missing_docs_in_private_items)]
24#![warn(clippy::needless_borrow)]
25#![warn(clippy::needless_pass_by_value)]
26#![warn(clippy::option_option)]
27#![deny(clippy::print_stderr)]
28#![deny(clippy::print_stdout)]
29#![warn(clippy::rc_buffer)]
30#![deny(clippy::ref_option_ref)]
31#![warn(clippy::semicolon_if_nothing_returned)]
32#![warn(clippy::trait_duplication_in_bounds)]
33#![deny(clippy::unchecked_time_subtraction)]
34#![deny(clippy::unnecessary_wraps)]
35#![warn(clippy::unseparated_literal_suffix)]
36#![deny(clippy::unwrap_used)]
37#![deny(clippy::mod_module_files)]
38#![allow(clippy::let_unit_value)] // This can reasonably be done for explicitness
39#![allow(clippy::uninlined_format_args)]
40#![allow(clippy::significant_drop_in_scrutinee)] // arti/-/merge_requests/588/#note_2812945
41#![allow(clippy::result_large_err)] // temporary workaround for arti#587
42#![allow(clippy::needless_raw_string_hashes)] // complained-about code is fine, often best
43#![allow(clippy::needless_lifetimes)] // See arti#1765
44#![allow(mismatched_lifetime_syntaxes)] // temporary workaround for arti#2060
45#![allow(clippy::collapsible_if)] // See arti#2342
46#![deny(clippy::unused_async)]
47//! <!-- @@ end lint list maintained by maint/add_warning @@ -->
48
49// TODO #1645 (either remove this, or decide to have it everywhere)
50#![cfg_attr(not(all(feature = "full")), allow(unused))]
51
52use crate::dense_range_map::DenseRangeMap;
53pub use crate::err::Error;
54use std::fmt::{Debug, Display, Formatter};
55use std::net::{IpAddr, Ipv6Addr};
56use std::num::{NonZeroU16, NonZeroU32};
57use std::ops::RangeInclusive;
58use std::str::FromStr;
59use std::sync::{Arc, OnceLock};
60
61mod dense_range_map;
62mod err;
63
64/// A parsed copy of the embedded database, created on first use and shared afterwards.
65#[cfg(feature = "embedded-db")]
66static EMBEDDED_DB_PARSED: OnceLock<Arc<GeoipDb>> = OnceLock::new();
67
/// A two-letter country code.
///
/// This type holds a purported "ISO 3166-1 alpha-2" country code, such as
/// "IT" for Italy or "UY" for Uruguay.
///
/// The sentinel value `??`, which we use to mean "country unknown", cannot be
/// stored here; use [`OptionCc`] if you need it. Apart from that, we make no
/// attempt to check that the code names a real country: we only ensure that
/// it is a pair of printing ASCII characters.
///
/// Note that the geoip databases included with Arti will only include real
/// countries; we do not include the pseudo-countries `A1` through `An` for
/// "anonymous proxies", since doing so would mean putting nearly all Tor relays
/// into one of those countries.
#[derive(Clone, Copy, PartialEq, Eq)]
#[repr(transparent)]
pub struct CountryCode {
    /// The two printable ASCII characters of the code, stored uppercase.
    ///
    /// `??` never appears here, since it is not a country; `OptionCc` is the
    /// type that can represent it.
    ///
    /// Keeping the value in a `NonZeroU16` means that an `Option<CountryCode>`
    /// needs only 2 bytes, which helps with alignment and storage.
    ///
    /// (A `NonZeroU16` is used in preference to `[NonZeroU8; 2]` so that every
    /// bit representation is a valid `Option<CountryCode>`.)
    inner: NonZeroU16,
}
97
98impl CountryCode {
99    /// Make a new `CountryCode`.
100    fn new(cc_orig: &str) -> Result<Self, Error> {
101        /// Try to convert an array of 2 bytes into a NonZeroU16.
102        #[inline]
103        fn try_cvt_to_nz(inp: [u8; 2]) -> Result<NonZeroU16, Error> {
104            if inp[0] == 0 || inp[1] == 0 {
105                return Err(Error::BadCountryCode("Country code contained NULs".into()));
106            }
107            Ok(u16::from_ne_bytes(inp)
108                .try_into()
109                .expect("zero arrived surprisingly"))
110        }
111
112        let cc = cc_orig.to_ascii_uppercase();
113
114        let cc: [u8; 2] = cc
115            .as_bytes()
116            .try_into()
117            .map_err(|_| Error::BadCountryCode(cc))?;
118
119        if !cc.iter().all(|b| b.is_ascii() && !b.is_ascii_control()) {
120            return Err(Error::BadCountryCode(cc_orig.to_owned()));
121        }
122
123        if &cc == b"??" {
124            return Err(Error::NowhereNotSupported);
125        }
126
127        Ok(Self {
128            inner: try_cvt_to_nz(cc).map_err(|_| Error::BadCountryCode(cc_orig.to_owned()))?,
129        })
130    }
131
132    /// Get the actual country code.
133    ///
134    /// This just calls `.as_ref()`.
135    pub fn get(&self) -> &str {
136        self.as_ref()
137    }
138}
139
140impl Display for CountryCode {
141    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
142        write!(f, "{}", self.as_ref())
143    }
144}
145
146impl Debug for CountryCode {
147    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
148        write!(f, "CountryCode(\"{}\")", self.as_ref())
149    }
150}
151
152impl AsRef<str> for CountryCode {
153    fn as_ref(&self) -> &str {
154        /// Convert a reference to a NonZeroU16 to a reference to
155        /// an array of 2 bytes.
156        #[inline]
157        fn cvt_ref(inp: &NonZeroU16) -> &[u8; 2] {
158            // SAFETY: Every NonZeroU16 has a layout, alignment, and bit validity that is
159            // also a valid [u8; 2].  The layout of arrays is also guaranteed.
160            //
161            // (We don't use try_into here because we need to return a str that
162            // points to a reference to self.)
163            let slice: &[NonZeroU16] = std::slice::from_ref(inp);
164            let (_, slice, _) = unsafe { slice.align_to::<u8>() };
165            slice
166                .try_into()
167                .expect("the resulting slice should have the correct length!")
168        }
169
170        // This shouldn't ever panic, since we shouldn't feed non-utf8 country
171        // codes in.
172        //
173        // In theory we could use from_utf8_unchecked, but that's probably not
174        // needed.
175        std::str::from_utf8(cvt_ref(&self.inner)).expect("invalid country code in CountryCode")
176    }
177}
178
179impl FromStr for CountryCode {
180    type Err = Error;
181
182    fn from_str(s: &str) -> Result<Self, Self::Err> {
183        CountryCode::new(s)
184    }
185}
186
187/// Wrapper for an `Option<`[`CountryCode`]`>` that encodes `None` as `??`.
188///
189/// Used so that we can implement foreign traits.
190#[derive(
191    Copy, Clone, Debug, Eq, PartialEq, derive_more::Into, derive_more::From, derive_more::AsRef,
192)]
193#[allow(clippy::exhaustive_structs)]
194pub struct OptionCc(pub Option<CountryCode>);
195
196impl FromStr for OptionCc {
197    type Err = Error;
198
199    fn from_str(s: &str) -> Result<Self, Self::Err> {
200        match CountryCode::new(s) {
201            Err(Error::NowhereNotSupported) => Ok(None.into()),
202            Err(e) => Err(e),
203            Ok(cc) => Ok(Some(cc).into()),
204        }
205    }
206}
207
208impl Display for OptionCc {
209    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
210        match self.0 {
211            Some(cc) => write!(f, "{}", cc),
212            None => write!(f, "??"),
213        }
214    }
215}
216
217/// The type of an ASN.
218type Asn = NonZeroU32;
219
220/// A database of IP addresses to country codes.
221#[derive(Clone, Eq, PartialEq, Debug)]
222pub struct GeoipDb {
223    /// The IPv4 subset of the database, with v4 addresses stored as 32-bit integers.
224    map_v4: DenseRangeMap<u32, CountryCode, Asn>,
225    /// The IPv6 subset of the database, with v6 addresses stored as 128-bit integers.
226    map_v6: DenseRangeMap<u128, CountryCode, Asn>,
227}
228
229impl GeoipDb {
230    /// Make a new `GeoipDb` using a compiled-in copy of the GeoIP database.
231    ///
232    /// The returned instance of the database is shared with `Arc` across all invocations of this
233    /// function in the same program.
234    #[cfg(feature = "embedded-db")]
235    pub fn new_embedded() -> Arc<Self> {
236        Arc::clone(EMBEDDED_DB_PARSED.get_or_init(|| {
237            use tor_geoip_db as db;
238            fn cvt_ccs(ccs: &'static [Option<NonZeroU16>]) -> &'static [Option<CountryCode>] {
239                // SAFETY: CountryCode is a repr(transparent) for NonZeroU16.
240                let (pre, data, post) = unsafe { ccs.align_to::<Option<CountryCode>>() };
241                assert!(pre.is_empty());
242                assert!(post.is_empty());
243                data
244            }
245
246            let map_v4 = DenseRangeMap::from_static_parts(db::ipv4s(), cvt_ccs(db::ipv4c()), None);
247            let map_v6 = DenseRangeMap::from_static_parts(db::ipv6s(), cvt_ccs(db::ipv6c()), None);
248
249            Arc::new(
250                // It's reasonable to assume the one we embedded is fine --
251                // we'll test it in CI, etc.
252                GeoipDb { map_v4, map_v6 },
253            )
254        }))
255    }
256
257    /// Make a new `GeoipDb` using provided copies of the v4 and v6 database, in Tor legacy format.
258    pub fn new_from_legacy_format(
259        db_v4: &str,
260        db_v6: &str,
261        include_asn: bool,
262    ) -> Result<Self, Error> {
263        let discard_asn = !include_asn;
264        let map_v4 = DenseRangeMap::try_from_sorted_inclusive_ranges(
265            db_v4
266                .lines()
267                .filter_map(|line| parse_line::<u32>(line).transpose()),
268            discard_asn,
269        )?;
270
271        let map_v6 = DenseRangeMap::try_from_sorted_inclusive_ranges(
272            db_v6
273                .lines()
274                .filter_map(|line| parse_line::<Ipv6Addr>(line).transpose()),
275            discard_asn,
276        )?;
277
278        Ok(Self { map_v4, map_v6 })
279    }
280
281    /// Return the database in a raw format suitable for embedding.
282    ///
283    /// This method and the format it returns are unstable.
284    /// This method should only be used for maintaining the database.
285    #[cfg(feature = "export")]
286    #[allow(clippy::type_complexity)]
287    pub fn export_raw(&self) -> RawGeoipDbExport {
288        let (ipv4_starts, ipv4_ccs, ipv4_asns) = self.map_v4.export();
289        let (ipv6_starts, ipv6_ccs, ipv6_asns) = self.map_v6.export();
290
291        RawGeoipDbExport {
292            ipv4_starts,
293            ipv4_ccs,
294            ipv4_asns,
295            ipv6_starts,
296            ipv6_ccs,
297            ipv6_asns,
298        }
299    }
300
301    /// Get a 2-letter country code for the given IP address, if this data is available.
302    pub fn lookup_country_code(&self, ip: IpAddr) -> Option<&CountryCode> {
303        match ip {
304            IpAddr::V4(v4) => self.map_v4.get1(&v4.into()),
305            IpAddr::V6(v6) => self.map_v6.get1(&v6.into()),
306        }
307    }
308
309    /// Determine a 2-letter country code for a host with multiple IP addresses.
310    ///
311    /// This looks up all of the IP addresses with `lookup_country_code`. If the lookups
312    /// return different countries, `None` is returned. IP addresses that fail to resolve
313    /// into a country are ignored if some of the other addresses do resolve successfully.
314    pub fn lookup_country_code_multi<I>(&self, ips: I) -> Option<&CountryCode>
315    where
316        I: IntoIterator<Item = IpAddr>,
317    {
318        let mut ret = None;
319
320        for ip in ips {
321            if let Some(cc) = self.lookup_country_code(ip) {
322                // If we already have a return value and it's different, then return None;
323                // a server can't be in two different countries.
324                if ret.is_some() && ret != Some(cc) {
325                    return None;
326                }
327
328                ret = Some(cc);
329            }
330        }
331
332        ret
333    }
334
335    /// Return the ASN the IP address is in, if this data is available.
336    pub fn lookup_asn(&self, ip: IpAddr) -> Option<u32> {
337        let cc = match ip {
338            IpAddr::V4(v4) => self.map_v4.get2(&v4.into()),
339            IpAddr::V6(v6) => self.map_v6.get2(&v6.into()),
340        };
341        cc.map(|nz| nz.get())
342    }
343}
344
/// A type that can appear as an address entry in one of our databases.
trait DbAddress: FromStr {
    /// The integer type used to represent this kind of address.
    type Int;

    /// Return this address as an integer.
    fn to_int(&self) -> Self::Int;
}

/// IPv4 entries are already written as plain integers.
impl DbAddress for u32 {
    type Int = u32;

    fn to_int(&self) -> u32 {
        *self
    }
}

/// IPv6 entries are written as addresses and widened to 128-bit integers.
impl DbAddress for Ipv6Addr {
    type Int = u128;

    fn to_int(&self) -> u128 {
        u128::from(*self)
    }
}
369
370/// A line as returned by [`parse_line`].
371type ParsedLine<T> = (RangeInclusive<T>, Option<CountryCode>, Option<Asn>);
372
373/// Parse a single line from a database, expecting addresses of type T.
374///
375/// Return Ok(None) if the line is empty.
376fn parse_line<T: DbAddress>(line: &str) -> Result<Option<ParsedLine<T::Int>>, Error>
377where
378    Error: From<<T as FromStr>::Err>,
379{
380    if line.starts_with('#') {
381        return Ok(None);
382    }
383    let line = line.trim();
384    if line.is_empty() {
385        return Ok(None);
386    }
387
388    let mut split = line.split(',');
389    let from = split
390        .next()
391        .ok_or(Error::BadFormat("empty line somehow?".into()))?
392        .parse::<T>()?
393        .to_int();
394    let to = split
395        .next()
396        .ok_or(Error::BadFormat("line with insufficient commas".into()))?
397        .parse::<T>()?
398        .to_int();
399    let cc = split
400        .next()
401        .ok_or(Error::BadFormat("line with insufficient commas".into()))?;
402    let cc = match cc {
403        "" => None,
404        cc => OptionCc::from_str(cc)?.0,
405    };
406    let asn = split.next().map(|x| x.parse::<u32>()).transpose()?;
407    // Treat "0" as "no asn".
408    let asn = asn.map(NonZeroU32::try_from).transpose().ok().flatten();
409
410    Ok(Some((from..=to, cc, asn)))
411}
412
413/// A (representation of a) host on the network which may have a known country code.
414pub trait HasCountryCode {
415    /// Return the country code in which this server is most likely located.
416    ///
417    /// This is usually implemented by a simple GeoIP lookup on the addresses provided by `HasAddrs`.
418    /// It follows that the server might not actually be in the returned country, but this is a
419    /// halfway decent estimate for what other servers might guess the server's location to be
420    /// (and thus useful for e.g. getting around simple geo-blocks, or having webpages return
421    /// the correct localised versions).
422    ///
423    /// Returning `None` signifies that no country code information is available. (Conflicting
424    /// GeoIP lookup results might also cause `None` to be returned; see
    /// [`GeoipDb::lookup_country_code_multi`].)
425    fn country_code(&self) -> Option<CountryCode>;
426}
427
/// An export of a GeoIp database in a raw format suitable for embedding.
///
/// This format is deliberately undocumented, and not for other uses.
#[cfg(feature = "export")]
#[allow(clippy::exhaustive_structs, missing_docs)]
pub struct RawGeoipDbExport<'a> {
    pub ipv4_starts: &'a [u32],
    pub ipv4_ccs: &'a [Option<CountryCode>],
    pub ipv4_asns: Option<&'a [Option<NonZeroU32>]>,
    pub ipv6_starts: &'a [u128],
    pub ipv6_ccs: &'a [Option<CountryCode>],
    pub ipv6_asns: Option<&'a [Option<NonZeroU32>]>,
}

#[cfg(feature = "export")]
impl RawGeoipDbExport<'_> {
    /// Save the contents of this export into a set of data files in the directory `path`.
    ///
    /// Each slice is written out as the raw bytes of its in-memory
    /// representation. The ASN files are written only when ASN data is present.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error encountered; files written before that
    /// point are left in place.
    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
        /// View a slice of values as the raw bytes that back it.
        fn into_bytes<T>(data: &[T]) -> &[u8] {
            // SAFETY: Every possible bit sequence is a valid u8.
            let (pre, bytes, post) = unsafe { data.align_to::<u8>() };
            assert!(pre.is_empty());
            assert!(post.is_empty());
            bytes
        }

        // Write one data file, named `name`, inside the target directory.
        let put = |name: &str, bytes: &[u8]| std::fs::write(path.join(name), bytes);

        put("geoip_data_v4s", into_bytes(self.ipv4_starts))?;
        put("geoip_data_v4c", into_bytes(self.ipv4_ccs))?;
        if let Some(asns) = self.ipv4_asns {
            put("geoip_data_v4a", into_bytes(asns))?;
        }

        put("geoip_data_v6s", into_bytes(self.ipv6_starts))?;
        put("geoip_data_v6c", into_bytes(self.ipv6_ccs))?;
        if let Some(asns) = self.ipv6_asns {
            put("geoip_data_v6a", into_bytes(asns))?;
        }

        Ok(())
    }
}
467
#[cfg(test)]
mod test {
    // @@ begin test lint list maintained by maint/add_warning @@
    #![allow(clippy::bool_assert_comparison)]
    #![allow(clippy::clone_on_copy)]
    #![allow(clippy::dbg_macro)]
    #![allow(clippy::mixed_attributes_style)]
    #![allow(clippy::print_stderr)]
    #![allow(clippy::print_stdout)]
    #![allow(clippy::single_char_pattern)]
    #![allow(clippy::unwrap_used)]
    #![allow(clippy::unchecked_time_subtraction)]
    #![allow(clippy::useless_vec)]
    #![allow(clippy::needless_pass_by_value)]
    //! <!-- @@ end test lint list maintained by maint/add_warning @@ -->

    use super::*;
    use std::net::Ipv4Addr;

    /// Look up `ip` in `db` and return its country as a plain string, if known.
    fn country_of(db: &GeoipDb, ip: IpAddr) -> Option<&str> {
        db.lookup_country_code(ip).map(CountryCode::get)
    }

    // NOTE(eta): this test takes a whole 1.6 seconds in *non-release* mode
    #[test]
    #[cfg(feature = "embedded-db")]
    fn embedded_db() {
        let db = GeoipDb::new_embedded();

        let v4: IpAddr = Ipv4Addr::new(8, 8, 8, 8).into();
        let v6: IpAddr = "2001:4860:4860::8888".parse().unwrap();

        assert_eq!(country_of(&db, v4), Some("US"));
        assert_eq!(country_of(&db, v6), Some("US"));
    }

    #[test]
    fn cc_rep() {
        let italy = CountryCode::new("IT").unwrap();
        assert_eq!(italy.as_ref(), "IT");
    }

    #[test]
    fn basic_lookups() {
        let src_v4 = r#"
        16909056,16909311,GB
        "#;
        let src_v6 = r#"
        dead:beef::,dead:ffff::,??
        fe80::,fe81::,US
        "#;
        let db = GeoipDb::new_from_legacy_format(src_v4, src_v6, true).unwrap();

        let v4 = |a, b, c, d| IpAddr::from(Ipv4Addr::new(a, b, c, d));
        let v6 = |s: &str| -> IpAddr { s.parse().unwrap() };

        // Inside and outside the single v4 range.
        assert_eq!(country_of(&db, v4(1, 2, 3, 4)), Some("GB"));
        assert_eq!(country_of(&db, v4(1, 1, 1, 1)), None);

        // Inside and outside the v6 range that has a country.
        assert_eq!(country_of(&db, v6("fe80::dead:beef")), Some("US"));
        assert_eq!(country_of(&db, v6("fe81::dead:beef")), None);

        // Inside the v6 range whose country is `??`.
        assert_eq!(country_of(&db, v6("dead:beef::1")), None);
    }

    #[test]
    fn cc_parse() -> Result<(), Error> {
        // real countries.
        assert_eq!("us".parse::<CountryCode>()?, "US".parse::<CountryCode>()?);
        assert_eq!("UY".parse::<CountryCode>()?, "UY".parse::<CountryCode>()?);

        // not real as of this writing, but still representable.
        assert_eq!("A7".parse::<CountryCode>()?, "a7".parse::<CountryCode>()?);
        assert_eq!("xz".parse::<CountryCode>()?, "xz".parse::<CountryCode>()?);

        let is_bad_code =
            |s: &str| matches!(CountryCode::from_str(s), Err(Error::BadCountryCode(_)));

        // Can't convert to two bytes.
        // (The second entry is the polar bear emoji, spelled out code point by code point.)
        for input in ["z", "\u{1F43B}\u{200D}\u{2744}\u{FE0F}", "Sheboygan"] {
            assert!(is_bad_code(input), "{:?} should be rejected", input);
        }

        // Can convert to two bytes, but still not printable ascii
        for input in ["\r\n", "\0\0", "¡"] {
            assert!(is_bad_code(input), "{:?} should be rejected", input);
        }

        // Not a country.
        assert!(matches!(
            CountryCode::from_str("??"),
            Err(Error::NowhereNotSupported)
        ));

        Ok(())
    }

    #[test]
    fn opt_cc_parse() -> Result<(), Error> {
        let brazil = OptionCc::from_str("BR")?.0;
        assert_eq!(brazil, Some(CountryCode::from_str("br")?));

        let nowhere = OptionCc::from_str("??")?.0;
        assert!(nowhere.is_none());

        Ok(())
    }
}