jiff/tz/
tzif.rs

1/*!
2This module provides support for TZif binary files from the [Time Zone
3Database].
4
5These binary files are the ones commonly found in Unix distributions in the
6`/usr/share/zoneinfo` directory.
7
8[Time Zone Database]: https://www.iana.org/time-zones
9*/
10
11use core::ops::Range;
12
13use alloc::{string::String, vec, vec::Vec};
14
15use crate::{
16    civil::DateTime,
17    error::{err, Error, ErrorContext},
18    timestamp::Timestamp,
19    tz::{
20        posix::{IanaTz, ReasonablePosixTimeZone},
21        AmbiguousOffset, Dst, Offset, TimeZoneAbbreviation,
22        TimeZoneOffsetInfo, TimeZoneTransition,
23    },
24    util::{
25        crc32,
26        escape::{Byte, Bytes},
27        t::UnixSeconds,
28    },
29};
30
/// A time zone based on IANA TZif formatted data.
///
/// TZif is a binary format described by RFC 8536. Its typical structure is to
/// define a single time zone per file in the `/usr/share/zoneinfo` directory
/// on Unix systems. The name of a time zone is its file path with the
/// `/usr/share/zoneinfo/` prefix stripped from it.
///
/// This type doesn't provide any facilities for dealing with files on disk
/// or the `/usr/share/zoneinfo` directory. This type is just for parsing the
/// contents of TZif formatted data in memory, and turning it into a data type
/// that can be used as a time zone.
#[derive(Debug)]
pub(crate) struct Tzif {
    /// The name handed to `Tzif::parse` by the caller, if any.
    name: Option<String>,
    /// The version byte copied from the TZif header. `Tzif::parse` treats a
    /// value of `0` as version 1 data. Otherwise, this is an ASCII byte
    /// corresponding to the version number. So, 0x32 (decimal 50) is '2'.
    ///
    /// This is unused. It's only used in `test` compilation for emitting
    /// diagnostic data about TZif files. If we really need to use this, we
    /// should probably just convert it to an actual integer.
    #[allow(dead_code)]
    version: u8,
    /// A CRC32 checksum that `Tzif::parse` computes over the raw bytes of
    /// the TZif data it consumed.
    checksum: u32,
    /// The transitions for this time zone. Lookups use binary search, so
    /// these are expected to be ordered by timestamp. The first entry is
    /// always a "dummy" transition at `Timestamp::MIN` inserted during
    /// parsing, which means this is never empty after a successful parse.
    transitions: Vec<Transition>,
    /// The local time types. Each transition refers to one of these via its
    /// `type_index`.
    types: Vec<LocalTimeType>,
    /// The time zone designation (abbreviation) table. Each local time type
    /// refers to a range within this string.
    designations: String,
    /// The leap second records found in the data.
    leap_seconds: Vec<LeapSecond>,
    /// The POSIX TZ rule consulted for times at or after the last
    /// transition. Version 1 data always leaves this as `None`.
    posix_tz: Option<ReasonablePosixTimeZone>,
}
59
60impl Tzif {
61    /// Parses the given data as a TZif formatted file.
62    ///
63    /// The name given is attached to the `Tzif` value returned, but is
64    /// otherwise not significant.
65    ///
66    /// If the given data is not recognized to be valid TZif, then an error is
67    /// returned.
68    ///
69    /// In general, callers may assume that it is safe to pass arbitrary or
70    /// even untrusted data to this function and count on it not panicking
71    /// or using resources that aren't limited to a small constant factor of
72    /// the size of the data itself. That is, callers can reliably limit the
73    /// resources used by limiting the size of the data given to this parse
74    /// function.
75    pub(crate) fn parse(
76        name: Option<String>,
77        bytes: &[u8],
78    ) -> Result<Tzif, Error> {
79        let original = bytes;
80        let name = name.into();
81        let (header32, rest) = Header::parse(4, bytes)
82            .map_err(|e| e.context("failed to parse 32-bit header"))?;
83        let (mut tzif, rest) = if header32.version == 0 {
84            Tzif::parse32(name, header32, rest)?
85        } else {
86            Tzif::parse64(name, header32, rest)?
87        };
88        // Compute the checksum using the entire contents of the TZif data.
89        let tzif_raw_len = (rest.as_ptr() as usize)
90            .checked_sub(original.as_ptr() as usize)
91            .unwrap();
92        let tzif_raw_bytes = &original[..tzif_raw_len];
93        tzif.checksum = crc32::sum(tzif_raw_bytes);
94        Ok(tzif)
95    }
96
97    /// Returns the name given to this TZif data in its constructor.
98    pub(crate) fn name(&self) -> Option<&str> {
99        self.name.as_deref()
100    }
101
102    /// Returns the appropriate time zone offset to use for the given
103    /// timestamp.
104    pub(crate) fn to_offset(&self, timestamp: Timestamp) -> Offset {
105        match self.to_local_time_type(timestamp) {
106            Ok(typ) => typ.offset,
107            Err(tz) => tz.to_offset(timestamp),
108        }
109    }
110
111    /// Returns the appropriate time zone offset to use for the given
112    /// timestamp.
113    ///
114    /// This also includes whether the offset returned should be considered to
115    /// be DST or not, along with the time zone abbreviation (e.g., EST for
116    /// standard time in New York, and EDT for DST in New York).
117    pub(crate) fn to_offset_info(
118        &self,
119        timestamp: Timestamp,
120    ) -> TimeZoneOffsetInfo<'_> {
121        let typ = match self.to_local_time_type(timestamp) {
122            Ok(typ) => typ,
123            Err(tz) => return tz.to_offset_info(timestamp),
124        };
125        let abbreviation =
126            TimeZoneAbbreviation::Borrowed(self.designation(typ));
127        TimeZoneOffsetInfo {
128            offset: typ.offset,
129            dst: typ.is_dst,
130            abbreviation,
131        }
132    }
133
134    /// Returns the local time type for the timestamp given.
135    ///
136    /// If one could not be found, then this implies that the caller should
137    /// use the POSIX time zone returned in the error variant.
138    fn to_local_time_type(
139        &self,
140        timestamp: Timestamp,
141    ) -> Result<&LocalTimeType, &ReasonablePosixTimeZone> {
142        // This is guaranteed because we always push at least one transition.
143        // This isn't guaranteed by TZif since it might have 0 transitions,
144        // but we always add a "dummy" first transition with our minimum
145        // `Timestamp` value. TZif doesn't do this because there is no
146        // universal minimum timestamp. (`i64::MIN` is a candidate, but that's
147        // likely to cause overflow in readers that don't do error checking.)
148        //
149        // The result of the dummy transition is that the code below is simpler
150        // with fewer special cases.
151        assert!(!self.transitions.is_empty(), "transitions is non-empty");
152        let index = if timestamp > self.transitions.last().unwrap().timestamp {
153            self.transitions.len() - 1
154        } else {
155            let search = self
156                .transitions
157                // It is an optimization to compare only by the second instead
158                // of the second and the nanosecond. This works for two
159                // reasons. Firstly, the timestamps in TZif are limited to
160                // second precision. Secondly, this may result in two
161                // timestamps comparing equal when they would otherwise be
162                // unequal (for example, when a timestamp given falls on a
163                // transition, but has non-zero fractional seconds). But this
164                // is okay, because it would otherwise get an `Err(i)`, and
165                // access `i-1`. i.e., The timestamp it compared equal to.
166                .binary_search_by_key(&timestamp.as_second(), |t| {
167                    t.timestamp.as_second()
168                });
169            match search {
170                // Since the first transition is always Timestamp::MIN, it's
171                // impossible for any timestamp to sort before it.
172                Err(0) => {
173                    unreachable!("impossible to come before Timestamp::MIN")
174                }
175                Ok(i) => i,
176                // i points to the position immediately after the matching
177                // timestamp. And since we know that i>0 because of the i==0
178                // check above, we can safely subtract 1.
179                Err(i) => i.checked_sub(1).expect("i is non-zero"),
180            }
181        };
182        // Our index is always in bounds. The only way it couldn't be is if
183        // binary search returns an Err(len) for a time greater than the
184        // maximum transition. But we account for that above by converting
185        // Err(len) to Err(len-1).
186        assert!(index < self.transitions.len());
187        // RFC 8536 says: "Local time for timestamps on or after the last
188        // transition is specified by the TZ string in the footer (Section 3.3)
189        // if present and nonempty; otherwise, it is unspecified."
190        //
191        // Subtracting 1 is OK because we know self.transitions is not empty.
192        let t = if index < self.transitions.len() - 1 {
193            // This is the typical case in "fat" TZif files: we found a
194            // matching transition.
195            &self.transitions[index]
196        } else {
197            match self.posix_tz.as_ref() {
198                // This is the typical case in "slim" TZif files, where the
199                // last transition is, as I understand it, the transition at
200                // which a consistent rule started that a POSIX TZ string can
201                // fully describe. For example, (as of 2024-03-27) the last
202                // transition in the "fat" America/New_York TZif file is
203                // in 2037, where as in the "slim" version it is 2007.
204                //
205                // This is likely why some things break with the "slim"
206                // version: they don't support POSIX TZ strings (or don't
207                // support them correctly).
208                Some(tz) => return Err(tz),
209                // This case is technically unspecified, but I think the
210                // typical thing to do is to just use the last transition.
211                // I'm not 100% sure on this one.
212                None => &self.transitions[index],
213            }
214        };
215        Ok(self.local_time_type(t))
216    }
217
218    /// Returns a possibly ambiguous timestamp for the given civil datetime.
219    ///
220    /// The given datetime should correspond to the "wall" clock time of what
221    /// humans use to tell time for this time zone.
222    ///
223    /// Note that "ambiguous timestamp" is represented by the possible
224    /// selection of offsets that could be applied to the given datetime. In
225    /// general, it is only ambiguous around transitions to-and-from DST. The
226    /// ambiguity can arise as a "fold" (when a particular wall clock time is
227    /// repeated) or as a "gap" (when a particular wall clock time is skipped
228    /// entirely).
229    pub(crate) fn to_ambiguous_kind(&self, dt: DateTime) -> AmbiguousOffset {
230        // This implementation very nearly mirrors `to_offset` above in the
231        // beginning: we do a binary search to find transition applicable for
232        // the given datetime. Except, we do it on wall clock times instead
233        // of timestamps. And in particular, each transition begins with a
234        // possibly ambiguous range of wall clock times corresponding to either
235        // a "gap" or "fold" in time.
236        assert!(!self.transitions.is_empty(), "transitions is non-empty");
237        let search =
238            self.transitions.binary_search_by_key(&dt, |t| t.wall.start());
239        let this_index = match search {
240            Err(0) => unreachable!("impossible to come before DateTime::MIN"),
241            Ok(i) => i,
242            Err(i) => i.checked_sub(1).expect("i is non-zero"),
243        };
244        assert!(this_index < self.transitions.len());
245
246        let this = &self.transitions[this_index];
247        let this_offset = self.local_time_type(this).offset;
248        // This is a little tricky, but we need to check for ambiguous civil
249        // datetimes before possibly using the POSIX TZ string. Namely, a
250        // datetime could be ambiguous with respect to the last transition,
251        // and we should handle that according to the gap/fold determined for
252        // that transition. We cover this case in tests in tz/mod.rs for the
253        // Pacific/Honolulu time zone, whose last transition begins with a gap.
254        match this.wall {
255            TransitionWall::Gap { end, .. } if dt < end => {
256                // A gap/fold can only appear when there exists a previous
257                // transition.
258                let prev_index = this_index.checked_sub(1).unwrap();
259                let prev = &self.transitions[prev_index];
260                let prev_offset = self.local_time_type(prev).offset;
261                return AmbiguousOffset::Gap {
262                    before: prev_offset,
263                    after: this_offset,
264                };
265            }
266            TransitionWall::Fold { end, .. } if dt < end => {
267                // A gap/fold can only appear when there exists a previous
268                // transition.
269                let prev_index = this_index.checked_sub(1).unwrap();
270                let prev = &self.transitions[prev_index];
271                let prev_offset = self.local_time_type(prev).offset;
272                return AmbiguousOffset::Fold {
273                    before: prev_offset,
274                    after: this_offset,
275                };
276            }
277            _ => {}
278        }
279        // The datetime given is not ambiguous with respect to any of the
280        // transitions in the TZif data. But, if we matched at or after the
281        // last transition, then we need to use the POSIX TZ string (which
282        // could still return an ambiguous offset).
283        if this_index == self.transitions.len() - 1 {
284            if let Some(tz) = self.posix_tz.as_ref() {
285                return tz.to_ambiguous_kind(dt);
286            }
287            // This case is unspecified according to RFC 8536. It means that
288            // the given datetime exceeds all transitions *and* there is no
289            // POSIX TZ string. So this can happen in V1 files for example.
290            // But those should hopefully be essentially non-existent nowadays
291            // (2024-03). In any case, we just fall through to using the last
292            // transition, which does seem likely to be wrong ~half the time
293            // in time zones with DST. But there really isn't much else we can
294            // do I think.
295        }
296        AmbiguousOffset::Unambiguous { offset: this_offset }
297    }
298
299    /// Returns the timestamp of the most recent time zone transition prior
300    /// to the timestamp given. If one doesn't exist, `None` is returned.
301    pub(crate) fn previous_transition(
302        &self,
303        ts: Timestamp,
304    ) -> Option<TimeZoneTransition> {
305        assert!(!self.transitions.is_empty(), "transitions is non-empty");
306        let search =
307            self.transitions.binary_search_by_key(&ts, |t| t.timestamp);
308        let index = match search {
309            Ok(i) | Err(i) => i.checked_sub(1)?,
310        };
311        let trans = if index == 0 {
312            // The first transition is a dummy that we insert, so if we land on
313            // it here, treat it as if it doesn't exist.
314            return None;
315        } else if index == self.transitions.len() - 1 {
316            if let Some(ref posix_tz) = self.posix_tz {
317                // Since the POSIX TZ must be consistent with the last
318                // transition, it must be the case that tzif_last <=
319                // posix_prev_trans in all cases. So the transition according
320                // to the POSIX TZ is always correct here.
321                //
322                // What if this returns `None` though? I'm not sure in which
323                // cases that could matter, and I think it might be a violation
324                // of the TZif format if it does.
325                return posix_tz.previous_transition(ts);
326            }
327            &self.transitions[index]
328        } else {
329            &self.transitions[index]
330        };
331        let typ = &self.types[usize::from(trans.type_index)];
332        Some(TimeZoneTransition {
333            timestamp: trans.timestamp,
334            offset: typ.offset,
335            abbrev: self.designation(typ),
336            dst: typ.is_dst,
337        })
338    }
339
340    /// Returns the timestamp of the soonest time zone transition after the
341    /// timestamp given. If one doesn't exist, `None` is returned.
342    pub(crate) fn next_transition(
343        &self,
344        ts: Timestamp,
345    ) -> Option<TimeZoneTransition> {
346        assert!(!self.transitions.is_empty(), "transitions is non-empty");
347        let search =
348            self.transitions.binary_search_by_key(&ts, |t| t.timestamp);
349        let index = match search {
350            Ok(i) => i.checked_add(1)?,
351            Err(i) => i,
352        };
353        let trans = if index == 0 {
354            // The first transition is a dummy that we insert, so if we land on
355            // it here, treat it as if it doesn't exist.
356            return None;
357        } else if index >= self.transitions.len() - 1 {
358            if let Some(ref posix_tz) = self.posix_tz {
359                // Since the POSIX TZ must be consistent with the last
360                // transition, it must be the case that next.timestamp <=
361                // posix_next_tans in all cases. So the transition according to
362                // the POSIX TZ is always correct here.
363                //
364                // What if this returns `None` though? I'm not sure in which
365                // cases that could matter, and I think it might be a violation
366                // of the TZif format if it does.
367                return posix_tz.next_transition(ts);
368            }
369            self.transitions.last().expect("last transition")
370        } else {
371            &self.transitions[index]
372        };
373        let typ = &self.types[usize::from(trans.type_index)];
374        Some(TimeZoneTransition {
375            timestamp: trans.timestamp,
376            offset: typ.offset,
377            abbrev: self.designation(typ),
378            dst: typ.is_dst,
379        })
380    }
381
382    fn designation(&self, typ: &LocalTimeType) -> &str {
383        // OK because we verify that the designation range on every local
384        // time type is a valid range into `self.designations`.
385        &self.designations[typ.designation()]
386    }
387
388    fn local_time_type(&self, transition: &Transition) -> &LocalTimeType {
389        // OK because we require that `type_index` always points to a valid
390        // local time type.
391        &self.types[usize::from(transition.type_index)]
392    }
393
394    fn first_transition(&self) -> &Transition {
395        // OK because we know we have at least one transition. This isn't
396        // true generally of the TZif format, since it does actually permit 0
397        // transitions. But as part of parsing, we always add a "dummy" first
398        // transition corresponding to the minimum possible Jiff timestamp.
399        // This makes some logic for transition lookups a little simpler by
400        // reducing special cases.
401        self.transitions.first().unwrap()
402    }
403
404    fn parse32<'b>(
405        name: Option<String>,
406        header32: Header,
407        bytes: &'b [u8],
408    ) -> Result<(Tzif, &'b [u8]), Error> {
409        let mut tzif = Tzif {
410            name,
411            version: header32.version,
412            // filled in later
413            checksum: 0,
414            transitions: vec![],
415            types: vec![],
416            designations: String::new(),
417            leap_seconds: vec![],
418            posix_tz: None,
419        };
420        let rest = tzif.parse_transitions(&header32, bytes)?;
421        let rest = tzif.parse_transition_types(&header32, rest)?;
422        let rest = tzif.parse_local_time_types(&header32, rest)?;
423        let rest = tzif.parse_time_zone_designations(&header32, rest)?;
424        let rest = tzif.parse_leap_seconds(&header32, rest)?;
425        let rest = tzif.parse_indicators(&header32, rest)?;
426        tzif.set_wall_datetimes();
427        Ok((tzif, rest))
428    }
429
430    fn parse64<'b>(
431        name: Option<String>,
432        header32: Header,
433        bytes: &'b [u8],
434    ) -> Result<(Tzif, &'b [u8]), Error> {
435        let (_, rest) = try_split_at(
436            "V1 TZif data block",
437            bytes,
438            header32.data_block_len()?,
439        )?;
440        let (header64, rest) = Header::parse(8, rest)
441            .map_err(|e| e.context("failed to parse 64-bit header"))?;
442        let mut tzif = Tzif {
443            name,
444            version: header64.version,
445            // filled in later
446            checksum: 0,
447            transitions: vec![],
448            types: vec![],
449            designations: String::new(),
450            leap_seconds: vec![],
451            posix_tz: None,
452        };
453        let rest = tzif.parse_transitions(&header64, rest)?;
454        let rest = tzif.parse_transition_types(&header64, rest)?;
455        let rest = tzif.parse_local_time_types(&header64, rest)?;
456        let rest = tzif.parse_time_zone_designations(&header64, rest)?;
457        let rest = tzif.parse_leap_seconds(&header64, rest)?;
458        let rest = tzif.parse_indicators(&header64, rest)?;
459        let rest = tzif.parse_footer(&header64, rest)?;
460        // Validates that the POSIX TZ string we parsed (if one exists) is
461        // consistent with the last transition in this time zone. This is
462        // required by RFC 8536.
463        //
464        // RFC 8536 says, "If the string is nonempty and one or more
465        // transitions appear in the version 2+ data, the string MUST be
466        // consistent with the last version 2+ transition."
467        //
468        // We need to be a little careful, since we always have at least one
469        // transition (accounting for the dummy `Timestamp::MIN` transition).
470        // So if we only have 1 transition and a POSIX TZ string, then we
471        // should not validate it since it's equivalent to the case of 0
472        // transitions and a POSIX TZ string.
473        if tzif.transitions.len() > 1 {
474            if let Some(ref tz) = tzif.posix_tz {
475                let last = tzif.transitions.last().expect("last transition");
476                let typ = tzif.local_time_type(last);
477                let info = tz.to_offset_info(last.timestamp);
478                if info.offset() != typ.offset {
479                    return Err(err!(
480                        "expected last transition to have DST offset \
481                         of {}, but got {} according to POSIX TZ \
482                         string {}",
483                        typ.offset,
484                        info.offset(),
485                        tz,
486                    ));
487                }
488                if info.dst() != typ.is_dst {
489                    return Err(err!(
490                        "expected last transition to have is_dst={}, \
491                         but got is_dst={} according to POSIX TZ \
492                         string {}",
493                        typ.is_dst.is_dst(),
494                        info.dst().is_dst(),
495                        tz,
496                    ));
497                }
498                if info.abbreviation() != tzif.designation(&typ) {
499                    return Err(err!(
500                        "expected last transition to have \
501                         designation={}, \
502                         but got designation={} according to POSIX TZ \
503                         string {}",
504                        info.abbreviation(),
505                        tzif.designation(&typ),
506                        tz,
507                    ));
508                }
509            }
510        }
511        tzif.set_wall_datetimes();
512        // N.B. We don't check that the TZif data is fully valid. It
513        // is possible for it to contain superfluous information. For
514        // example, a non-zero local time type that is never referenced
515        // by a transition.
516        Ok((tzif, rest))
517    }
518
519    fn parse_transitions<'b>(
520        &mut self,
521        header: &Header,
522        bytes: &'b [u8],
523    ) -> Result<&'b [u8], Error> {
524        let (bytes, rest) = try_split_at(
525            "transition times data block",
526            bytes,
527            header.transition_times_len()?,
528        )?;
529        let mut it = bytes.chunks_exact(header.time_size);
530        // RFC 8536 says: "If there are no transitions, local time for all
531        // timestamps is specified by the TZ string in the footer if present
532        // and nonempty; otherwise, it is specified by time type 0."
533        //
534        // RFC 8536 also says: "Local time for timestamps before the first
535        // transition is specified by the first time type (time type
536        // 0)."
537        //
538        // So if there are no transitions, pushing this dummy one will result
539        // in the desired behavior even when it's the only transition.
540        // Similarly, since this is the minimum timestamp value, it will
541        // trigger for any times before the first transition found in the TZif
542        // data.
543        self.transitions.push(Transition {
544            timestamp: Timestamp::MIN,
545            wall: TransitionWall::Unambiguous { start: DateTime::MIN },
546            type_index: 0,
547        });
548        while let Some(chunk) = it.next() {
549            let seconds = if header.is_32bit() {
550                i64::from(from_be_bytes_i32(chunk))
551            } else {
552                from_be_bytes_i64(chunk)
553            };
554            let timestamp =
555                Timestamp::from_second(seconds).unwrap_or_else(|_| {
556                    // We really shouldn't error here just because the Unix
557                    // timestamp is outside what Jiff supports. Since what Jiff
558                    // supports is _somewhat_ arbitrary. But Jiff's supported
559                    // range is good enough for all realistic purposes, so we
560                    // just clamp an out-of-range Unix timestamp to the Jiff
561                    // min or max value.
562                    //
563                    // This can't result in the sorting order being wrong, but
564                    // it can result in a transition that is duplicative with
565                    // the dummy transition we inserted above. This should be
566                    // fine.
567                    let clamped = seconds
568                        .clamp(UnixSeconds::MIN_REPR, UnixSeconds::MAX_REPR);
569                    warn!(
570                        "found Unix timestamp {seconds} that is outside \
571                         Jiff's supported range, clamping to {clamped}",
572                    );
573                    // Guaranteed to succeed since we clamped `seconds` such
574                    // that it is in the supported range of `Timestamp`.
575                    Timestamp::from_second(clamped).unwrap()
576                });
577            self.transitions.push(Transition {
578                timestamp,
579                // We can't compute the wall clock times until we know the
580                // actual offset for the transition prior to this one. We don't
581                // know that until we parse the local time types.
582                wall: TransitionWall::Unambiguous {
583                    start: DateTime::default(),
584                },
585                // We can't fill in the type index either. We fill this in
586                // later when we parse the transition types.
587                type_index: 0,
588            });
589        }
590        assert!(it.remainder().is_empty());
591        Ok(rest)
592    }
593
594    fn parse_transition_types<'b>(
595        &mut self,
596        header: &Header,
597        bytes: &'b [u8],
598    ) -> Result<&'b [u8], Error> {
599        let (bytes, rest) = try_split_at(
600            "transition types data block",
601            bytes,
602            header.transition_types_len()?,
603        )?;
604        // We start our transition indices at 1 because we always insert a
605        // dummy first transition corresponding to `Timestamp::MIN`. Its type
606        // index is always 0, so there's no need to change it here.
607        for (transition_index, &type_index) in (1..).zip(bytes) {
608            if usize::from(type_index) >= header.tzh_typecnt {
609                return Err(err!(
610                    "found transition type index {type_index},
611                     but there are only {} local time types",
612                    header.tzh_typecnt,
613                ));
614            }
615            self.transitions[transition_index].type_index = type_index;
616        }
617        Ok(rest)
618    }
619
620    fn parse_local_time_types<'b>(
621        &mut self,
622        header: &Header,
623        bytes: &'b [u8],
624    ) -> Result<&'b [u8], Error> {
625        let (bytes, rest) = try_split_at(
626            "local time types data block",
627            bytes,
628            header.local_time_types_len()?,
629        )?;
630        let mut it = bytes.chunks_exact(6);
631        while let Some(chunk) = it.next() {
632            let offset_seconds = from_be_bytes_i32(&chunk[..4]);
633            let offset =
634                Offset::from_seconds(offset_seconds).map_err(|e| {
635                    err!(
636                        "found local time type with out-of-bounds offset: {e}"
637                    )
638                })?;
639            let is_dst = Dst::from(chunk[4] == 1);
640            let designation = chunk[5]..chunk[5];
641            self.types.push(LocalTimeType {
642                offset,
643                is_dst,
644                designation,
645                indicator: Indicator::LocalWall,
646            });
647        }
648        assert!(it.remainder().is_empty());
649        Ok(rest)
650    }
651
652    fn parse_time_zone_designations<'b>(
653        &mut self,
654        header: &Header,
655        bytes: &'b [u8],
656    ) -> Result<&'b [u8], Error> {
657        let (bytes, rest) = try_split_at(
658            "time zone designations data block",
659            bytes,
660            header.time_zone_designations_len()?,
661        )?;
662        self.designations =
663            String::from_utf8(bytes.to_vec()).map_err(|_| {
664                err!(
665                    "time zone designations are not valid UTF-8: {:?}",
666                    Bytes(bytes),
667                )
668            })?;
669        // Holy hell, this is brutal. The boundary conditions are crazy.
670        for (i, typ) in self.types.iter_mut().enumerate() {
671            let start = usize::from(typ.designation.start);
672            let Some(suffix) = self.designations.get(start..) else {
673                return Err(err!(
674                    "local time type {i} has designation index of {start}, \
675                     but cannot be more than {}",
676                    self.designations.len(),
677                ));
678            };
679            let Some(len) = suffix.find('\x00') else {
680                return Err(err!(
681                    "local time type {i} has designation index of {start}, \
682                     but could not find NUL terminator after it in \
683                     designations: {:?}",
684                    self.designations,
685                ));
686            };
687            let Some(end) = start.checked_add(len) else {
688                return Err(err!(
689                    "local time type {i} has designation index of {start}, \
690                     but its length {len} is too big",
691                ));
692            };
693            typ.designation.end = u8::try_from(end).map_err(|_| {
694                err!(
695                    "local time type {i} has designation range of \
696                     {start}..{end}, but end is too big",
697                )
698            })?;
699        }
700        Ok(rest)
701    }
702
703    fn parse_leap_seconds<'b>(
704        &mut self,
705        header: &Header,
706        bytes: &'b [u8],
707    ) -> Result<&'b [u8], Error> {
708        let (bytes, rest) = try_split_at(
709            "leap seconds data block",
710            bytes,
711            header.leap_second_len()?,
712        )?;
713        let chunk_len = header
714            .time_size
715            .checked_add(4)
716            .expect("time_size plus 4 fits in usize");
717        let mut it = bytes.chunks_exact(chunk_len);
718        while let Some(chunk) = it.next() {
719            let (occur_bytes, corr_bytes) = chunk.split_at(header.time_size);
720            let occur_seconds = if header.is_32bit() {
721                i64::from(from_be_bytes_i32(occur_bytes))
722            } else {
723                from_be_bytes_i64(occur_bytes)
724            };
725            let occurrence =
726                Timestamp::from_second(occur_seconds).map_err(|e| {
727                    err!(
728                        "leap second occurrence {occur_seconds} \
729                         is out of range: {e}"
730                    )
731                })?;
732            let correction = from_be_bytes_i32(corr_bytes);
733            self.leap_seconds.push(LeapSecond { occurrence, correction });
734        }
735        assert!(it.remainder().is_empty());
736        Ok(rest)
737    }
738
739    fn parse_indicators<'b>(
740        &mut self,
741        header: &Header,
742        bytes: &'b [u8],
743    ) -> Result<&'b [u8], Error> {
744        let (std_wall_bytes, rest) = try_split_at(
745            "standard/wall indicators data block",
746            bytes,
747            header.standard_wall_len()?,
748        )?;
749        let (ut_local_bytes, rest) = try_split_at(
750            "UT/local indicators data block",
751            rest,
752            header.ut_local_len()?,
753        )?;
754        if std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
755            // This is a weird case, but technically possible only if all
756            // UT/local indicators are 0. If any are 1, then it's an error,
757            // because it would require the corresponding std/wall indicator
758            // to be 1 too. Which it can't be, because there aren't any. So
759            // we just check that they're all zeros.
760            for (i, &byte) in ut_local_bytes.iter().enumerate() {
761                if byte != 0 {
762                    return Err(err!(
763                        "found UT/local indicator '{byte}' for local time \
764                         type {i}, but it must be 0 since all std/wall \
765                         indicators are 0",
766                    ));
767                }
768            }
769        } else if !std_wall_bytes.is_empty() && ut_local_bytes.is_empty() {
770            for (i, &byte) in std_wall_bytes.iter().enumerate() {
771                // Indexing is OK because Header guarantees that the number of
772                // indicators is 0 or equal to the number of types.
773                self.types[i].indicator = if byte == 0 {
774                    Indicator::LocalWall
775                } else if byte == 1 {
776                    Indicator::LocalStandard
777                } else {
778                    return Err(err!(
779                        "found invalid std/wall indicator '{byte}' for \
780                         local time type {i}, it must be 0 or 1",
781                    ));
782                };
783            }
784        } else if !std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
785            assert_eq!(std_wall_bytes.len(), ut_local_bytes.len());
786            let it = std_wall_bytes.iter().zip(ut_local_bytes);
787            for (i, (&stdwall, &utlocal)) in it.enumerate() {
788                // Indexing is OK because Header guarantees that the number of
789                // indicators is 0 or equal to the number of types.
790                self.types[i].indicator = match (stdwall, utlocal) {
791                    (0, 0) => Indicator::LocalWall,
792                    (1, 0) => Indicator::LocalStandard,
793                    (1, 1) => Indicator::UTStandard,
794                    (0, 1) => {
795                        return Err(err!(
796                            "found illegal ut-wall combination for \
797                         local time type {i}, only local-wall, local-standard \
798                         and ut-standard are allowed",
799                        ))
800                    }
801                    _ => {
802                        return Err(err!(
803                            "found illegal std/wall or ut/local value for \
804                         local time type {i}, each must be 0 or 1",
805                        ))
806                    }
807                };
808            }
809        } else {
810            // If they're both empty then we don't need to do anything. Every
811            // local time type record already has the correct default for this
812            // case set.
813            debug_assert!(std_wall_bytes.is_empty());
814            debug_assert!(ut_local_bytes.is_empty());
815        }
816        Ok(rest)
817    }
818
819    fn parse_footer<'b>(
820        &mut self,
821        _header: &Header,
822        bytes: &'b [u8],
823    ) -> Result<&'b [u8], Error> {
824        if bytes.is_empty() {
825            return Err(err!(
826                "invalid V2+ TZif footer, expected \\n, \
827                 but found unexpected end of data",
828            ));
829        }
830        if bytes[0] != b'\n' {
831            return Err(err!(
832                "invalid V2+ TZif footer, expected {:?}, but found {:?}",
833                Byte(b'\n'),
834                Byte(bytes[0]),
835            ));
836        }
837        let bytes = &bytes[1..];
838        // Only scan up to 1KB for a NUL terminator in case we somehow got
839        // passed a huge block of bytes.
840        let toscan = &bytes[..bytes.len().min(1024)];
841        let Some(nlat) = toscan.iter().position(|&b| b == b'\n') else {
842            return Err(err!(
843                "invalid V2 TZif footer, could not find {:?} \
844                 terminator in: {:?}",
845                Byte(b'\n'),
846                Bytes(toscan),
847            ));
848        };
849        let (bytes, rest) = bytes.split_at(nlat);
850        if !bytes.is_empty() {
851            // We could in theory limit TZ strings to their strict POSIX
852            // definition here for TZif V2, but I don't think there is any
853            // harm in allowing the extensions in V2 formatted TZif data. Note
854            // that the GNU tooling allow it via the `TZ` environment variable
855            // even though POSIX doesn't specify it. This all seems okay to me
856            // because the V3+ extension is a strict superset of functionality.
857            let iana_tz = IanaTz::parse_v3plus(bytes)?;
858            self.posix_tz = Some(iana_tz.into_tz());
859        }
860        Ok(&rest[1..])
861    }
862
863    /// This sets the wall clock times for each transition.
864    ///
865    /// The wall clock time corresponds to time on the clock that the
866    /// transition begins. That is, it is the time offset by the previous
867    /// transition's offset.
868    ///
869    /// This also computes whether there is a gap or fold or neither between
870    /// each transition. This is used to resolve ambiguous timestamps when
871    /// given a civil datetime.
872    fn set_wall_datetimes(&mut self) {
873        let mut prev = self.local_time_type(self.first_transition()).offset;
874        // We iterate over indices instead of `transitions.iter_mut()` because
875        // of the borrow checker breaking composition.
876        for i in 0..self.transitions.len() {
877            let this = self.local_time_type(&self.transitions[i]).offset;
878            let t = &mut self.transitions[i];
879            t.wall = if prev == this {
880                // Equivalent offsets means there can never be any ambiguity.
881                let start = prev.to_datetime(t.timestamp);
882                TransitionWall::Unambiguous { start }
883            } else if prev < this {
884                // When the offset of the previous transition is less, that
885                // means there is some non-zero amount of time that is
886                // "skipped" when moving to the next transition. Thus, we have
887                // a gap. The start of the gap is the offset which gets us the
888                // earliest time, i.e., the smaller of the two offsets.
889                let start = prev.to_datetime(t.timestamp);
890                let end = this.to_datetime(t.timestamp);
891                TransitionWall::Gap { start, end }
892            } else {
893                // When the offset of the previous transition is greater, that
894                // means there is some non-zero amount of time that will be
895                // replayed on a wall clock in this time zone. Thus, we have
896                // a fold. The start of the gold is the offset which gets us
897                // the earliest time, i.e., the smaller of the two offsets.
898                assert!(prev > this);
899                let start = this.to_datetime(t.timestamp);
900                let end = prev.to_datetime(t.timestamp);
901                TransitionWall::Fold { start, end }
902            };
903            prev = this;
904        }
905    }
906}
907
908impl Eq for Tzif {}
909
910impl PartialEq for Tzif {
911    fn eq(&self, rhs: &Tzif) -> bool {
912        self.name == rhs.name && self.checksum == rhs.checksum
913    }
914}
915
/// A transition to a different offset.
///
/// Each transition pairs the instant at which it begins with the local time
/// type that applies from that instant onward, along with a pre-computed
/// wall clock representation of that instant.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Transition {
    /// The UNIX leap time at which the transition starts. The transition
    /// continues up to and _not_ including the next transition.
    timestamp: Timestamp,
    /// The wall clock time for when this transition begins. This includes
    /// boundary conditions for quickly determining if a given wall clock time
    /// is ambiguous (i.e., falls in a gap or a fold).
    ///
    /// This is derived from `timestamp` and the offsets of this transition
    /// and the one before it.
    wall: TransitionWall,
    /// The index into the sequence of local time type records. This is what
    /// provides the correct offset (from UTC) that is active beginning at
    /// this transition.
    type_index: u8,
}
931
/// The wall clock time for when a transition begins.
///
/// This explicitly represents ambiguous wall clock times that occur at the
/// boundaries of transitions.
///
/// The start of the wall clock time is always the earlier possible wall clock
/// time that could occur with this transition's corresponding offset. For a
/// gap, it's the previous transition's offset. For a fold, it's the current
/// transition's offset.
///
/// For example, DST for `America/New_York` began on `2024-03-10T07:00:00+00`.
/// The offset prior to this instant in time is `-05`, corresponding
/// to standard time (EST). Thus, in wall clock time, DST began at
/// `2024-03-10T02:00:00`. And since this is a DST transition that jumps ahead
/// an hour, the start of DST also corresponds to the start of a gap. That is,
/// the times `02:00:00` through `02:59:59` never appear on a clock for this
/// hour. The question is thus: which offset should we apply to `02:00:00`?
/// We could apply the offset from the earlier transition `-05` and get
/// `2024-03-10T01:00:00-05` (that's `2024-03-10T06:00:00+00`), or we could
/// apply the offset from the later transition `-04` and get
/// `2024-03-10T03:00:00-04` (that's `2024-03-10T07:00:00+00`).
///
/// So in the above, we would have a `Gap` variant where `start` (inclusive) is
/// `2024-03-10T02:00:00` and `end` (exclusive) is `2024-03-10T03:00:00`.
///
/// The fold case is the same idea, but where the same time is repeated.
/// For example, in `America/New_York`, standard time began on
/// `2024-11-03T06:00:00+00`. The offset prior to this instant in time
/// is `-04`, corresponding to DST (EDT). Thus, in wall clock time, DST
/// ended at `2024-11-03T02:00:00`. However, since this is a fold, the
/// actual set of ambiguous times begins at `2024-11-03T01:00:00` and
/// ends at `2024-11-03T01:59:59.999999999`. That is, the wall clock time
/// `2024-11-03T02:00:00` is unambiguous.
///
/// So in the fold case above, we would have a `Fold` variant where
/// `start` (inclusive) is `2024-11-03T01:00:00` and `end` (exclusive) is
/// `2024-11-03T02:00:00`.
///
/// Since this gets bundled in with the sorted sequence of transitions, we'll
/// use the "start" time in all three cases as our target of binary search.
/// Once we land on a transition, we'll know our given wall clock time is
/// greater than or equal to its start wall clock time. At that point, to
/// determine if there is ambiguity, we merely need to determine if the given
/// wall clock time is less than the corresponding `end` time. If it is, then
/// it falls in a gap or fold. Otherwise, it's unambiguous.
///
/// Note that we could compute these datetime values while searching for the
/// correct transition, but there's a fair bit of math involved in going
/// between timestamps (which is what TZif gives us) and calendar datetimes
/// (which is what we're given as input). It is also necessary that we offset
/// the timestamp given in TZif at some point, since it is in UTC and the
/// datetime given is in wall clock time. So I decided it would be worth
/// pre-computing what we need in terms of what the input is. This way, we
/// don't need to do any conversions, or indeed, any arithmetic at all, for
/// time zone lookups. We *could* store these as transitions, but then the
/// input datetime would need to be converted to a timestamp before searching
/// the transitions.
#[derive(Clone, Debug, Eq, PartialEq)]
enum TransitionWall {
    /// This transition cannot possibly lead to an ambiguous offset because
    /// its offset is equivalent to the offset of the previous transition.
    Unambiguous {
        /// The civil datetime corresponding to the beginning of this
        /// transition, inclusive.
        start: DateTime,
    },
    /// This occurs when this transition's offset is strictly greater than the
    /// previous transition's offset. This effectively results in a "gap" of
    /// time equal to the difference in the offsets between the two
    /// transitions.
    Gap {
        /// The start of a gap (inclusive) in wall clock time.
        start: DateTime,
        /// The end of the gap (exclusive) in wall clock time.
        end: DateTime,
    },
    /// This occurs when this transition's offset is strictly less than the
    /// previous transition's offset. This results in a "fold" of time where
    /// the two transitions have an overlap where it is ambiguous which one
    /// applies given a wall clock time. In effect, a span of time equal to the
    /// difference in the offsets is repeated.
    Fold {
        /// The start of the fold (inclusive) in wall clock time.
        start: DateTime,
        /// The end of the fold (exclusive) in wall clock time.
        end: DateTime,
    },
}
1020
1021impl TransitionWall {
1022    fn start(&self) -> DateTime {
1023        match *self {
1024            TransitionWall::Unambiguous { start } => start,
1025            TransitionWall::Gap { start, .. } => start,
1026            TransitionWall::Fold { start, .. } => start,
1027        }
1028    }
1029}
1030
/// A single local time type.
///
/// Basically, this is what transition times map to. Once you have a local time
/// type, then you know the offset, whether it's in DST and the corresponding
/// abbreviation. (There is also an "indicator," but I have no clue what it
/// means. See the `Indicator` type for a rant.)
#[derive(Clone, Debug, Eq, PartialEq)]
struct LocalTimeType {
    /// The offset from UTC for this local time type.
    offset: Offset,
    /// Whether this local time type is in daylight saving time or not.
    is_dst: Dst,
    /// The byte range of this type's abbreviation within the time zone
    /// designations string.
    ///
    /// The start is the designation index read from the local time type
    /// record. The end (exclusive) is filled in once the designations block
    /// has been parsed, and is the position of the NUL terminator that
    /// follows the start.
    designation: Range<u8>,
    /// The combined std/wall and UT/local indicator for this type. It is
    /// `Indicator::LocalWall` unless the indicator blocks say otherwise.
    indicator: Indicator,
}
1044
1045impl LocalTimeType {
1046    fn designation(&self) -> Range<usize> {
1047        usize::from(self.designation.start)..usize::from(self.designation.end)
1048    }
1049}
1050
/// This enum corresponds to the possible indicator values for standard/wall
/// and UT/local.
///
/// Note that UT+Wall is not allowed.
///
/// I honestly have no earthly clue what they mean. I've read the section about
/// them in RFC 8536 several times and I can't make sense of it. I've even
/// looked at data files that have these set and still can't make sense of
/// them. I've even looked at what other datetime libraries do with these, and
/// they all seem to just ignore them. Like, WTF. I've spent the last couple
/// months of my life steeped in time, and I just cannot figure this out. Am I
/// just dumb?
///
/// Anyway, we parse them, but otherwise ignore them because that's what all
/// the cool kids do.
///
/// The default is `LocalWall`, which also occurs when no indicators are
/// present.
///
/// I tried again and still don't get it. Here's a dump for `Pacific/Honolulu`:
///
/// ```text
/// $ ./scripts/jiff-debug tzif /usr/share/zoneinfo/Pacific/Honolulu
/// TIME ZONE NAME
///   /usr/share/zoneinfo/Pacific/Honolulu
/// LOCAL TIME TYPES
///   000: offset=-10:31:26, is_dst=false, designation=LMT, indicator=local/wall
///   001: offset=-10:30, is_dst=false, designation=HST, indicator=local/wall
///   002: offset=-09:30, is_dst=true, designation=HDT, indicator=local/wall
///   003: offset=-09:30, is_dst=true, designation=HWT, indicator=local/wall
///   004: offset=-09:30, is_dst=true, designation=HPT, indicator=ut/std
///   005: offset=-10, is_dst=false, designation=HST, indicator=local/wall
/// TRANSITIONS
///   0000: -9999-01-02T01:59:59 :: -377705023201 :: type=0, -10:31:26, is_dst=false, LMT, local/wall
///   0001: 1896-01-13T22:31:26 :: -2334101314 :: type=1, -10:30, is_dst=false, HST, local/wall
///   0002: 1933-04-30T12:30:00 :: -1157283000 :: type=2, -09:30, is_dst=true, HDT, local/wall
///   0003: 1933-05-21T21:30:00 :: -1155436200 :: type=1, -10:30, is_dst=false, HST, local/wall
///   0004: 1942-02-09T12:30:00 :: -880198200 :: type=3, -09:30, is_dst=true, HWT, local/wall
///   0005: 1945-08-14T23:00:00 :: -769395600 :: type=4, -09:30, is_dst=true, HPT, ut/std
///   0006: 1945-09-30T11:30:00 :: -765376200 :: type=1, -10:30, is_dst=false, HST, local/wall
///   0007: 1947-06-08T12:30:00 :: -712150200 :: type=5, -10, is_dst=false, HST, local/wall
/// POSIX TIME ZONE STRING
///   HST10
/// ```
///
/// See how type 004 has a ut/std indicator? What on earth does that mean?
/// All transitions are defined in terms of UTC. I confirmed this with `zdump`:
///
/// ```text
/// $ zdump -v Pacific/Honolulu | rg 1945
/// Pacific/Honolulu  Tue Aug 14 22:59:59 1945 UT = Tue Aug 14 13:29:59 1945 HWT isdst=1 gmtoff=-34200
/// Pacific/Honolulu  Tue Aug 14 23:00:00 1945 UT = Tue Aug 14 13:30:00 1945 HPT isdst=1 gmtoff=-34200
/// Pacific/Honolulu  Sun Sep 30 11:29:59 1945 UT = Sun Sep 30 01:59:59 1945 HPT isdst=1 gmtoff=-34200
/// Pacific/Honolulu  Sun Sep 30 11:30:00 1945 UT = Sun Sep 30 01:00:00 1945 HST isdst=0 gmtoff=-37800
/// ```
///
/// The times match up. All of them. The indicators don't seem to make a
/// difference. I'm clearly missing something.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Indicator {
    /// The std/wall indicator is `0` and the UT/local indicator is `0` (or
    /// the indicators are absent). Displayed as `local/wall`.
    LocalWall,
    /// The std/wall indicator is `1` and the UT/local indicator is `0` (or
    /// absent). Displayed as `local/std`.
    LocalStandard,
    /// The std/wall indicator is `1` and the UT/local indicator is `1`.
    /// Displayed as `ut/std`.
    UTStandard,
}
1115
1116impl core::fmt::Display for Indicator {
1117    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1118        match *self {
1119            Indicator::LocalWall => write!(f, "local/wall"),
1120            Indicator::LocalStandard => write!(f, "local/std"),
1121            Indicator::UTStandard => write!(f, "ut/std"),
1122        }
1123    }
1124}
1125
/// A leap second "correction" record.
///
/// One of these is created for every record in the leap second data block.
#[derive(Clone, Debug, Eq, PartialEq)]
struct LeapSecond {
    /// The Unix leap time at which the leap second occurred.
    occurrence: Timestamp,
    /// The leap second offset. Usually +1 or -1.
    ///
    /// This is stored exactly as it appears in the data, as a signed 32-bit
    /// integer.
    correction: i32,
}
1134
/// The header for a TZif formatted file.
///
/// V2+ TZif formatted data has two headers: one for V1 data, and then a
/// second following the V1 data block that describes another data block which
/// uses 64-bit timestamps. The two headers both have the same format and both
/// use 32-bit big-endian encoded integers.
#[derive(Debug)]
struct Header {
    /// The size of the timestamps encoded in the data block.
    ///
    /// This is guaranteed to be either 4 (for V1) or 8 (for the 64-bit header
    /// block in V2+).
    ///
    /// This isn't read from the data. It is given by the caller when parsing
    /// a header.
    time_size: usize,
    /// The file format version.
    ///
    /// Note that this is either a NUL byte (for version 1), or an ASCII byte
    /// corresponding to the version number. That is, `0x32` for `2`, `0x33`
    /// for `3` or `0x34` for `4`. Note also that just because zoneinfo might
    /// have been recently generated does not mean it uses the latest format
    /// version. It seems like newer versions are only compiled by `zic` when
    /// they are needed. For example, `America/New_York` on my system (as of
    /// `2024-03-25`) has version `0x32`, but `Asia/Jerusalem` has version
    /// `0x33`.
    version: u8,
    /// Number of UT/local indicators stored in the file.
    ///
    /// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
    tzh_ttisutcnt: usize,
    /// The number of standard/wall indicators stored in the file.
    ///
    /// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
    tzh_ttisstdcnt: usize,
    /// The number of leap seconds for which data entries are stored in the
    /// file.
    tzh_leapcnt: usize,
    /// The number of transition times for which data entries are stored in
    /// the file.
    tzh_timecnt: usize,
    /// The number of local time types for which data entries are stored in the
    /// file.
    ///
    /// This is checked to be at least `1`.
    tzh_typecnt: usize,
    /// The number of bytes of time zone abbreviation strings stored in the
    /// file.
    ///
    /// This is checked to be at least `1`.
    tzh_charcnt: usize,
}
1184
1185impl Header {
1186    /// Parse the header record from the given bytes.
1187    ///
1188    /// Upon success, return the header and all bytes after the header.
1189    ///
1190    /// The given `time_size` must be 4 or 8, corresponding to either the
1191    /// V1 header block or the V2+ header block, respectively.
1192    fn parse(
1193        time_size: usize,
1194        bytes: &[u8],
1195    ) -> Result<(Header, &[u8]), Error> {
1196        assert!(time_size == 4 || time_size == 8, "time size must be 4 or 8");
1197        if bytes.len() < 44 {
1198            return Err(err!("invalid header: too short"));
1199        }
1200        let (magic, rest) = bytes.split_at(4);
1201        if magic != b"TZif" {
1202            return Err(err!("invalid header: magic bytes mismatch"));
1203        }
1204        let (version, rest) = rest.split_at(1);
1205        let (_reserved, rest) = rest.split_at(15);
1206
1207        let (tzh_ttisutcnt_bytes, rest) = rest.split_at(4);
1208        let (tzh_ttisstdcnt_bytes, rest) = rest.split_at(4);
1209        let (tzh_leapcnt_bytes, rest) = rest.split_at(4);
1210        let (tzh_timecnt_bytes, rest) = rest.split_at(4);
1211        let (tzh_typecnt_bytes, rest) = rest.split_at(4);
1212        let (tzh_charcnt_bytes, rest) = rest.split_at(4);
1213
1214        let tzh_ttisutcnt = from_be_bytes_u32_to_usize(tzh_ttisutcnt_bytes)
1215            .map_err(|e| e.context("failed to parse tzh_ttisutcnt"))?;
1216        let tzh_ttisstdcnt = from_be_bytes_u32_to_usize(tzh_ttisstdcnt_bytes)
1217            .map_err(|e| e.context("failed to parse tzh_ttisstdcnt"))?;
1218        let tzh_leapcnt = from_be_bytes_u32_to_usize(tzh_leapcnt_bytes)
1219            .map_err(|e| e.context("failed to parse tzh_leapcnt"))?;
1220        let tzh_timecnt = from_be_bytes_u32_to_usize(tzh_timecnt_bytes)
1221            .map_err(|e| e.context("failed to parse tzh_timecnt"))?;
1222        let tzh_typecnt = from_be_bytes_u32_to_usize(tzh_typecnt_bytes)
1223            .map_err(|e| e.context("failed to parse tzh_typecnt"))?;
1224        let tzh_charcnt = from_be_bytes_u32_to_usize(tzh_charcnt_bytes)
1225            .map_err(|e| e.context("failed to parse tzh_charcnt"))?;
1226
1227        if tzh_ttisutcnt != 0 && tzh_ttisutcnt != tzh_typecnt {
1228            return Err(err!(
1229                "expected tzh_ttisutcnt={tzh_ttisutcnt} to be zero \
1230                 or equal to tzh_typecnt={tzh_typecnt}",
1231            ));
1232        }
1233        if tzh_ttisstdcnt != 0 && tzh_ttisstdcnt != tzh_typecnt {
1234            return Err(err!(
1235                "expected tzh_ttisstdcnt={tzh_ttisstdcnt} to be zero \
1236                 or equal to tzh_typecnt={tzh_typecnt}",
1237            ));
1238        }
1239        if tzh_typecnt < 1 {
1240            return Err(err!(
1241                "expected tzh_typecnt={tzh_typecnt} to be at least 1",
1242            ));
1243        }
1244        if tzh_charcnt < 1 {
1245            return Err(err!(
1246                "expected tzh_charcnt={tzh_charcnt} to be at least 1",
1247            ));
1248        }
1249
1250        let header = Header {
1251            time_size,
1252            version: version[0],
1253            tzh_ttisutcnt,
1254            tzh_ttisstdcnt,
1255            tzh_leapcnt,
1256            tzh_timecnt,
1257            tzh_typecnt,
1258            tzh_charcnt,
1259        };
1260        Ok((header, rest))
1261    }
1262
1263    /// Returns true if this header is for a 32-bit data block.
1264    ///
1265    /// When false, it is guaranteed that this header is for a 64-bit data
1266    /// block.
1267    fn is_32bit(&self) -> bool {
1268        self.time_size == 4
1269    }
1270
1271    /// Returns the size of the data block, in bytes, for this header.
1272    ///
1273    /// This returns an error if the arithmetic required to compute the
1274    /// length would overflow.
1275    ///
1276    /// This is useful for, e.g., skipping over the 32-bit V1 data block in
1277    /// V2+ TZif formatted files.
1278    fn data_block_len(&self) -> Result<usize, Error> {
1279        let a = self.transition_times_len()?;
1280        let b = self.transition_types_len()?;
1281        let c = self.local_time_types_len()?;
1282        let d = self.time_zone_designations_len()?;
1283        let e = self.leap_second_len()?;
1284        let f = self.standard_wall_len()?;
1285        let g = self.ut_local_len()?;
1286        a.checked_add(b)
1287            .and_then(|z| z.checked_add(c))
1288            .and_then(|z| z.checked_add(d))
1289            .and_then(|z| z.checked_add(e))
1290            .and_then(|z| z.checked_add(f))
1291            .and_then(|z| z.checked_add(g))
1292            .ok_or_else(|| {
1293                err!(
1294                    "length of data block in V{} tzfile is too big",
1295                    self.version
1296                )
1297            })
1298    }
1299
1300    fn transition_times_len(&self) -> Result<usize, Error> {
1301        self.tzh_timecnt.checked_mul(self.time_size).ok_or_else(|| {
1302            err!("tzh_timecnt value {} is too big", self.tzh_timecnt)
1303        })
1304    }
1305
1306    fn transition_types_len(&self) -> Result<usize, Error> {
1307        Ok(self.tzh_timecnt)
1308    }
1309
1310    fn local_time_types_len(&self) -> Result<usize, Error> {
1311        self.tzh_typecnt.checked_mul(6).ok_or_else(|| {
1312            err!("tzh_typecnt value {} is too big", self.tzh_typecnt)
1313        })
1314    }
1315
1316    fn time_zone_designations_len(&self) -> Result<usize, Error> {
1317        Ok(self.tzh_charcnt)
1318    }
1319
1320    fn leap_second_len(&self) -> Result<usize, Error> {
1321        let record_len = self
1322            .time_size
1323            .checked_add(4)
1324            .expect("4-or-8 plus 4 always fits in usize");
1325        self.tzh_leapcnt.checked_mul(record_len).ok_or_else(|| {
1326            err!("tzh_leapcnt value {} is too big", self.tzh_leapcnt)
1327        })
1328    }
1329
1330    fn standard_wall_len(&self) -> Result<usize, Error> {
1331        Ok(self.tzh_ttisstdcnt)
1332    }
1333
1334    fn ut_local_len(&self) -> Result<usize, Error> {
1335        Ok(self.tzh_ttisutcnt)
1336    }
1337}
1338
/// Cheaply checks whether the given data could be TZif formatted.
///
/// Only the leading `TZif` magic bytes are inspected. This means `true` may
/// be returned for data that turns out not to be valid TZif (a false
/// positive), but `false` is never returned for data that is TZif (no false
/// negatives).
#[cfg(feature = "tzdb-zoneinfo")]
pub(crate) fn is_possibly_tzif(data: &[u8]) -> bool {
    matches!(data, [b'T', b'Z', b'i', b'f', ..])
}
1349
1350/// Interprets the given slice as an unsigned 32-bit big endian integer,
1351/// attempts to convert it to a `usize` and returns it.
1352///
1353/// # Panics
1354///
1355/// When `bytes.len() != 4`.
1356///
1357/// # Errors
1358///
1359/// This errors if the `u32` parsed from the given bytes cannot fit in a
1360/// `usize`.
1361fn from_be_bytes_u32_to_usize(bytes: &[u8]) -> Result<usize, Error> {
1362    let n = from_be_bytes_u32(bytes);
1363    usize::try_from(n).map_err(|_| {
1364        err!(
1365            "failed to parse integer {n} (too big, max allowed is {}",
1366            usize::MAX
1367        )
1368    })
1369}
1370
/// Decodes exactly four bytes as a big endian unsigned 32-bit integer.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_u32(bytes: &[u8]) -> u32 {
    let array: [u8; 4] = bytes.try_into().unwrap();
    u32::from_be_bytes(array)
}
1380
/// Decodes exactly four bytes as a big endian signed 32-bit integer.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_i32(bytes: &[u8]) -> i32 {
    let array: [u8; 4] = bytes.try_into().unwrap();
    i32::from_be_bytes(array)
}
1390
/// Decodes exactly eight bytes as a big endian signed 64-bit integer.
///
/// # Panics
///
/// When `bytes.len() != 8`.
fn from_be_bytes_i64(bytes: &[u8]) -> i64 {
    let array: [u8; 8] = bytes.try_into().unwrap();
    i64::from_be_bytes(array)
}
1400
1401/// Splits the given slice of bytes at the index given.
1402///
1403/// If the index is out of range (greater than `bytes.len()`) then an error is
1404/// returned. The error message will include the `what` string given, which is
1405/// meant to describe the thing being split.
1406fn try_split_at<'b>(
1407    what: &'static str,
1408    bytes: &'b [u8],
1409    at: usize,
1410) -> Result<(&'b [u8], &'b [u8]), Error> {
1411    if at > bytes.len() {
1412        Err(err!(
1413            "expected at least {at} bytes for {what}, \
1414             but found only {} bytes",
1415            bytes.len(),
1416        ))
1417    } else {
1418        Ok(bytes.split_at(at))
1419    }
1420}
1421
1422#[cfg(test)]
1423mod tests {
1424    use alloc::string::ToString;
1425
1426    use crate::tz::testdata::TZIF_TEST_FILES;
1427
1428    use super::*;
1429
1430    /// This converts TZif data into a human readable format.
1431    ///
1432    /// This is useful for debugging (via `./scripts/jiff-debug tzif`), but we
1433    /// also use it for snapshot testing to make reading the test output at
1434    /// least *somewhat* comprehensible for humans. Otherwise, one needs to
1435    /// read and understand Unix timestamps. That ain't going to fly.
1436    ///
1437    /// For this to work, we make sure everything in a `Tzif` value is
1438    /// represented in some way in this output.
1439    fn tzif_to_human_readable(tzif: &Tzif) -> String {
1440        use std::io::Write;
1441
1442        let mut out = tabwriter::TabWriter::new(vec![])
1443            .alignment(tabwriter::Alignment::Left);
1444
1445        writeln!(out, "TIME ZONE NAME").unwrap();
1446        writeln!(out, "  {}", tzif.name().unwrap_or("UNNAMED")).unwrap();
1447
1448        writeln!(out, "TIME ZONE VERSION").unwrap();
1449        writeln!(out, "  {}", char::try_from(tzif.version).unwrap()).unwrap();
1450
1451        writeln!(out, "LOCAL TIME TYPES").unwrap();
1452        for (i, typ) in tzif.types.iter().enumerate() {
1453            writeln!(
1454                out,
1455                "  {i:03}:\toffset={off}\t\
1456                   designation={desig}\t{dst}\tindicator={ind}",
1457                off = typ.offset,
1458                desig = tzif.designation(&typ),
1459                dst = if typ.is_dst.is_dst() { "dst" } else { "" },
1460                ind = typ.indicator,
1461            )
1462            .unwrap();
1463        }
1464        if !tzif.transitions.is_empty() {
1465            writeln!(out, "TRANSITIONS").unwrap();
1466            for (i, t) in tzif.transitions.iter().enumerate() {
1467                let dt = Offset::UTC.to_datetime(t.timestamp);
1468                let typ = &tzif.types[usize::from(t.type_index)];
1469                let wall = alloc::format!("{:?}", t.wall.start());
1470                let ambiguous = match t.wall {
1471                    TransitionWall::Unambiguous { .. } => {
1472                        "unambiguous".to_string()
1473                    }
1474                    TransitionWall::Gap { end, .. } => {
1475                        alloc::format!(" gap-until({end:?})")
1476                    }
1477                    TransitionWall::Fold { end, .. } => {
1478                        alloc::format!("fold-until({end:?})")
1479                    }
1480                };
1481
1482                writeln!(
1483                    out,
1484                    "  {i:04}:\t{dt:?}Z\tunix={ts}\twall={wall}\t\
1485                       {ambiguous}\t\
1486                       type={type_index}\t{off}\t\
1487                       {desig}\t{dst}",
1488                    ts = t.timestamp.as_second(),
1489                    type_index = t.type_index,
1490                    off = typ.offset,
1491                    desig = tzif.designation(typ),
1492                    dst = if typ.is_dst.is_dst() { "dst" } else { "" },
1493                )
1494                .unwrap();
1495            }
1496        }
1497        if !tzif.leap_seconds.is_empty() {
1498            writeln!(out, "LEAP SECONDS").unwrap();
1499            for ls in tzif.leap_seconds.iter() {
1500                let dt = Offset::UTC.to_datetime(ls.occurrence);
1501                let c = ls.correction;
1502                writeln!(out, "  {dt:?}\tcorrection={c}").unwrap();
1503            }
1504        }
1505        if let Some(ref posix_tz) = tzif.posix_tz {
1506            writeln!(out, "POSIX TIME ZONE STRING").unwrap();
1507            writeln!(out, "  {}", posix_tz).unwrap();
1508        }
1509        String::from_utf8(out.into_inner().unwrap()).unwrap()
1510    }
1511
1512    /// DEBUG COMMAND
1513    ///
1514    /// Takes environment variable `JIFF_DEBUG_TZIF_PATH` as input, and treats
1515    /// the value as a TZif file path. This test will open the file, parse it
1516    /// as a TZif and then dump debug data about the file in a human readable
1517    /// plain text format.
1518    #[cfg(feature = "std")]
1519    #[test]
1520    fn debug_tzif() -> anyhow::Result<()> {
1521        use anyhow::Context;
1522
1523        let _ = crate::logging::Logger::init();
1524
1525        const ENV: &str = "JIFF_DEBUG_TZIF_PATH";
1526        let Some(val) = std::env::var_os(ENV) else { return Ok(()) };
1527        let Ok(val) = val.into_string() else {
1528            anyhow::bail!("{ENV} has invalid UTF-8")
1529        };
1530        let bytes =
1531            std::fs::read(&val).with_context(|| alloc::format!("{val:?}"))?;
1532        let tzif = Tzif::parse(Some(val.to_string()), &bytes)?;
1533        std::eprint!("{}", tzif_to_human_readable(&tzif));
1534        Ok(())
1535    }
1536
1537    #[test]
1538    fn tzif_parse_v2plus() {
1539        for tzif_test in TZIF_TEST_FILES {
1540            insta::assert_snapshot!(
1541                alloc::format!("{}_v2+", tzif_test.name),
1542                tzif_to_human_readable(&tzif_test.parse())
1543            );
1544        }
1545    }
1546
1547    #[test]
1548    fn tzif_parse_v1() {
1549        for tzif_test in TZIF_TEST_FILES {
1550            insta::assert_snapshot!(
1551                alloc::format!("{}_v1", tzif_test.name),
1552                tzif_to_human_readable(&tzif_test.parse_v1())
1553            );
1554        }
1555    }
1556
1557    /// This tests walks the /usr/share/zoneinfo directory (if it exists) and
1558    /// tries to parse every TZif formatted file it can find. We don't really
1559    /// do much with it other than to ensure we don't panic or return an error.
1560    /// That is, we check that we can parse each file, but not that we do so
1561    /// correctly.
1562    #[cfg(feature = "tzdb-zoneinfo")]
1563    #[cfg(target_os = "linux")]
1564    #[test]
1565    fn zoneinfo() {
1566        const TZDIR: &str = "/usr/share/zoneinfo";
1567
1568        for result in walkdir::WalkDir::new(TZDIR) {
1569            // Just skip if we got an error traversing the directory tree.
1570            // These aren't related to our parsing, so it's some other problem
1571            // (like the directory not existing).
1572            let Ok(dent) = result else { continue };
1573            // This test can take some time in debug mode, so skip parsing
1574            // some of the less frequently used TZif files.
1575            let Some(name) = dent.path().to_str() else { continue };
1576            if name.contains("right/") || name.contains("posix/") {
1577                continue;
1578            }
1579            // Again, skip if we can't read. Not my monkeys, not my circus.
1580            let Ok(bytes) = std::fs::read(dent.path()) else { continue };
1581            if !is_possibly_tzif(&bytes) {
1582                continue;
1583            }
1584            let tzname = dent
1585                .path()
1586                .strip_prefix(TZDIR)
1587                .unwrap_or_else(|_| {
1588                    panic!("all paths in TZDIR have {TZDIR:?} prefix")
1589                })
1590                .to_str()
1591                .expect("all paths to be valid UTF-8")
1592                .to_string();
1593            // OK at this point, we're pretty sure `bytes` should be a TZif
1594            // binary file. So try to parse it and fail the test if it fails.
1595            if let Err(err) = Tzif::parse(Some(tzname), &bytes) {
1596                panic!("failed to parse TZif file {:?}: {err}", dent.path());
1597            }
1598        }
1599    }
1600}