jiff/tz/tzif.rs
1/*!
2This module provides support for TZif binary files from the [Time Zone
3Database].
4
5These binary files are the ones commonly found in Unix distributions in the
6`/usr/share/zoneinfo` directory.
7
8[Time Zone Database]: https://www.iana.org/time-zones
9*/
10
11use core::ops::Range;
12
13use alloc::{string::String, vec, vec::Vec};
14
15use crate::{
16 civil::DateTime,
17 error::{err, Error, ErrorContext},
18 timestamp::Timestamp,
19 tz::{
20 posix::{IanaTz, ReasonablePosixTimeZone},
21 AmbiguousOffset, Dst, Offset, TimeZoneAbbreviation,
22 TimeZoneOffsetInfo, TimeZoneTransition,
23 },
24 util::{
25 crc32,
26 escape::{Byte, Bytes},
27 t::UnixSeconds,
28 },
29};
30
31/// A time zone based on IANA TZif formatted data.
32///
33/// TZif is a binary format described by RFC 8536. Its typical structure is to
34/// define a single time zone per file in the `/usr/share/zoneinfo` directory
35/// on Unix systems. The name of a time zone is its file path with the
36/// `/usr/share/zoneinfo/` prefix stripped from it.
37///
38/// This type doesn't provide any facilities for dealing with files on disk
39/// or the `/usr/share/zoneinfo` directory. This type is just for parsing the
40/// contents of TZif formatted data in memory, and turning it into a data type
41/// that can be used as a time zone.
42#[derive(Debug)]
43pub(crate) struct Tzif {
44 name: Option<String>,
45 /// An ASCII byte corresponding to the version number. So, 0x50 is '2'.
46 ///
47 /// This is unused. It's only used in `test` compilation for emitting
48 /// diagnostic data about TZif files. If we really need to use this, we
49 /// should probably just convert it to an actual integer.
50 #[allow(dead_code)]
51 version: u8,
52 checksum: u32,
53 transitions: Vec<Transition>,
54 types: Vec<LocalTimeType>,
55 designations: String,
56 leap_seconds: Vec<LeapSecond>,
57 posix_tz: Option<ReasonablePosixTimeZone>,
58}
59
60impl Tzif {
61 /// Parses the given data as a TZif formatted file.
62 ///
63 /// The name given is attached to the `Tzif` value returned, but is
64 /// otherwise not significant.
65 ///
66 /// If the given data is not recognized to be valid TZif, then an error is
67 /// returned.
68 ///
69 /// In general, callers may assume that it is safe to pass arbitrary or
70 /// even untrusted data to this function and count on it not panicking
71 /// or using resources that aren't limited to a small constant factor of
72 /// the size of the data itself. That is, callers can reliably limit the
73 /// resources used by limiting the size of the data given to this parse
74 /// function.
75 pub(crate) fn parse(
76 name: Option<String>,
77 bytes: &[u8],
78 ) -> Result<Tzif, Error> {
79 let original = bytes;
80 let name = name.into();
81 let (header32, rest) = Header::parse(4, bytes)
82 .map_err(|e| e.context("failed to parse 32-bit header"))?;
83 let (mut tzif, rest) = if header32.version == 0 {
84 Tzif::parse32(name, header32, rest)?
85 } else {
86 Tzif::parse64(name, header32, rest)?
87 };
88 // Compute the checksum using the entire contents of the TZif data.
89 let tzif_raw_len = (rest.as_ptr() as usize)
90 .checked_sub(original.as_ptr() as usize)
91 .unwrap();
92 let tzif_raw_bytes = &original[..tzif_raw_len];
93 tzif.checksum = crc32::sum(tzif_raw_bytes);
94 Ok(tzif)
95 }
96
97 /// Returns the name given to this TZif data in its constructor.
98 pub(crate) fn name(&self) -> Option<&str> {
99 self.name.as_deref()
100 }
101
102 /// Returns the appropriate time zone offset to use for the given
103 /// timestamp.
104 pub(crate) fn to_offset(&self, timestamp: Timestamp) -> Offset {
105 match self.to_local_time_type(timestamp) {
106 Ok(typ) => typ.offset,
107 Err(tz) => tz.to_offset(timestamp),
108 }
109 }
110
111 /// Returns the appropriate time zone offset to use for the given
112 /// timestamp.
113 ///
114 /// This also includes whether the offset returned should be considered to
115 /// be DST or not, along with the time zone abbreviation (e.g., EST for
116 /// standard time in New York, and EDT for DST in New York).
117 pub(crate) fn to_offset_info(
118 &self,
119 timestamp: Timestamp,
120 ) -> TimeZoneOffsetInfo<'_> {
121 let typ = match self.to_local_time_type(timestamp) {
122 Ok(typ) => typ,
123 Err(tz) => return tz.to_offset_info(timestamp),
124 };
125 let abbreviation =
126 TimeZoneAbbreviation::Borrowed(self.designation(typ));
127 TimeZoneOffsetInfo {
128 offset: typ.offset,
129 dst: typ.is_dst,
130 abbreviation,
131 }
132 }
133
134 /// Returns the local time type for the timestamp given.
135 ///
136 /// If one could not be found, then this implies that the caller should
137 /// use the POSIX time zone returned in the error variant.
138 fn to_local_time_type(
139 &self,
140 timestamp: Timestamp,
141 ) -> Result<&LocalTimeType, &ReasonablePosixTimeZone> {
142 // This is guaranteed because we always push at least one transition.
143 // This isn't guaranteed by TZif since it might have 0 transitions,
144 // but we always add a "dummy" first transition with our minimum
145 // `Timestamp` value. TZif doesn't do this because there is no
146 // universal minimum timestamp. (`i64::MIN` is a candidate, but that's
147 // likely to cause overflow in readers that don't do error checking.)
148 //
149 // The result of the dummy transition is that the code below is simpler
150 // with fewer special cases.
151 assert!(!self.transitions.is_empty(), "transitions is non-empty");
152 let index = if timestamp > self.transitions.last().unwrap().timestamp {
153 self.transitions.len() - 1
154 } else {
155 let search = self
156 .transitions
157 // It is an optimization to compare only by the second instead
158 // of the second and the nanosecond. This works for two
159 // reasons. Firstly, the timestamps in TZif are limited to
160 // second precision. Secondly, this may result in two
161 // timestamps comparing equal when they would otherwise be
162 // unequal (for example, when a timestamp given falls on a
163 // transition, but has non-zero fractional seconds). But this
164 // is okay, because it would otherwise get an `Err(i)`, and
165 // access `i-1`. i.e., The timestamp it compared equal to.
166 .binary_search_by_key(×tamp.as_second(), |t| {
167 t.timestamp.as_second()
168 });
169 match search {
170 // Since the first transition is always Timestamp::MIN, it's
171 // impossible for any timestamp to sort before it.
172 Err(0) => {
173 unreachable!("impossible to come before Timestamp::MIN")
174 }
175 Ok(i) => i,
176 // i points to the position immediately after the matching
177 // timestamp. And since we know that i>0 because of the i==0
178 // check above, we can safely subtract 1.
179 Err(i) => i.checked_sub(1).expect("i is non-zero"),
180 }
181 };
182 // Our index is always in bounds. The only way it couldn't be is if
183 // binary search returns an Err(len) for a time greater than the
184 // maximum transition. But we account for that above by converting
185 // Err(len) to Err(len-1).
186 assert!(index < self.transitions.len());
187 // RFC 8536 says: "Local time for timestamps on or after the last
188 // transition is specified by the TZ string in the footer (Section 3.3)
189 // if present and nonempty; otherwise, it is unspecified."
190 //
191 // Subtracting 1 is OK because we know self.transitions is not empty.
192 let t = if index < self.transitions.len() - 1 {
193 // This is the typical case in "fat" TZif files: we found a
194 // matching transition.
195 &self.transitions[index]
196 } else {
197 match self.posix_tz.as_ref() {
198 // This is the typical case in "slim" TZif files, where the
199 // last transition is, as I understand it, the transition at
200 // which a consistent rule started that a POSIX TZ string can
201 // fully describe. For example, (as of 2024-03-27) the last
202 // transition in the "fat" America/New_York TZif file is
203 // in 2037, where as in the "slim" version it is 2007.
204 //
205 // This is likely why some things break with the "slim"
206 // version: they don't support POSIX TZ strings (or don't
207 // support them correctly).
208 Some(tz) => return Err(tz),
209 // This case is technically unspecified, but I think the
210 // typical thing to do is to just use the last transition.
211 // I'm not 100% sure on this one.
212 None => &self.transitions[index],
213 }
214 };
215 Ok(self.local_time_type(t))
216 }
217
218 /// Returns a possibly ambiguous timestamp for the given civil datetime.
219 ///
220 /// The given datetime should correspond to the "wall" clock time of what
221 /// humans use to tell time for this time zone.
222 ///
223 /// Note that "ambiguous timestamp" is represented by the possible
224 /// selection of offsets that could be applied to the given datetime. In
225 /// general, it is only ambiguous around transitions to-and-from DST. The
226 /// ambiguity can arise as a "fold" (when a particular wall clock time is
227 /// repeated) or as a "gap" (when a particular wall clock time is skipped
228 /// entirely).
229 pub(crate) fn to_ambiguous_kind(&self, dt: DateTime) -> AmbiguousOffset {
230 // This implementation very nearly mirrors `to_offset` above in the
231 // beginning: we do a binary search to find transition applicable for
232 // the given datetime. Except, we do it on wall clock times instead
233 // of timestamps. And in particular, each transition begins with a
234 // possibly ambiguous range of wall clock times corresponding to either
235 // a "gap" or "fold" in time.
236 assert!(!self.transitions.is_empty(), "transitions is non-empty");
237 let search =
238 self.transitions.binary_search_by_key(&dt, |t| t.wall.start());
239 let this_index = match search {
240 Err(0) => unreachable!("impossible to come before DateTime::MIN"),
241 Ok(i) => i,
242 Err(i) => i.checked_sub(1).expect("i is non-zero"),
243 };
244 assert!(this_index < self.transitions.len());
245
246 let this = &self.transitions[this_index];
247 let this_offset = self.local_time_type(this).offset;
248 // This is a little tricky, but we need to check for ambiguous civil
249 // datetimes before possibly using the POSIX TZ string. Namely, a
250 // datetime could be ambiguous with respect to the last transition,
251 // and we should handle that according to the gap/fold determined for
252 // that transition. We cover this case in tests in tz/mod.rs for the
253 // Pacific/Honolulu time zone, whose last transition begins with a gap.
254 match this.wall {
255 TransitionWall::Gap { end, .. } if dt < end => {
256 // A gap/fold can only appear when there exists a previous
257 // transition.
258 let prev_index = this_index.checked_sub(1).unwrap();
259 let prev = &self.transitions[prev_index];
260 let prev_offset = self.local_time_type(prev).offset;
261 return AmbiguousOffset::Gap {
262 before: prev_offset,
263 after: this_offset,
264 };
265 }
266 TransitionWall::Fold { end, .. } if dt < end => {
267 // A gap/fold can only appear when there exists a previous
268 // transition.
269 let prev_index = this_index.checked_sub(1).unwrap();
270 let prev = &self.transitions[prev_index];
271 let prev_offset = self.local_time_type(prev).offset;
272 return AmbiguousOffset::Fold {
273 before: prev_offset,
274 after: this_offset,
275 };
276 }
277 _ => {}
278 }
279 // The datetime given is not ambiguous with respect to any of the
280 // transitions in the TZif data. But, if we matched at or after the
281 // last transition, then we need to use the POSIX TZ string (which
282 // could still return an ambiguous offset).
283 if this_index == self.transitions.len() - 1 {
284 if let Some(tz) = self.posix_tz.as_ref() {
285 return tz.to_ambiguous_kind(dt);
286 }
287 // This case is unspecified according to RFC 8536. It means that
288 // the given datetime exceeds all transitions *and* there is no
289 // POSIX TZ string. So this can happen in V1 files for example.
290 // But those should hopefully be essentially non-existent nowadays
291 // (2024-03). In any case, we just fall through to using the last
292 // transition, which does seem likely to be wrong ~half the time
293 // in time zones with DST. But there really isn't much else we can
294 // do I think.
295 }
296 AmbiguousOffset::Unambiguous { offset: this_offset }
297 }
298
299 /// Returns the timestamp of the most recent time zone transition prior
300 /// to the timestamp given. If one doesn't exist, `None` is returned.
301 pub(crate) fn previous_transition(
302 &self,
303 ts: Timestamp,
304 ) -> Option<TimeZoneTransition> {
305 assert!(!self.transitions.is_empty(), "transitions is non-empty");
306 let search =
307 self.transitions.binary_search_by_key(&ts, |t| t.timestamp);
308 let index = match search {
309 Ok(i) | Err(i) => i.checked_sub(1)?,
310 };
311 let trans = if index == 0 {
312 // The first transition is a dummy that we insert, so if we land on
313 // it here, treat it as if it doesn't exist.
314 return None;
315 } else if index == self.transitions.len() - 1 {
316 if let Some(ref posix_tz) = self.posix_tz {
317 // Since the POSIX TZ must be consistent with the last
318 // transition, it must be the case that tzif_last <=
319 // posix_prev_trans in all cases. So the transition according
320 // to the POSIX TZ is always correct here.
321 //
322 // What if this returns `None` though? I'm not sure in which
323 // cases that could matter, and I think it might be a violation
324 // of the TZif format if it does.
325 return posix_tz.previous_transition(ts);
326 }
327 &self.transitions[index]
328 } else {
329 &self.transitions[index]
330 };
331 let typ = &self.types[usize::from(trans.type_index)];
332 Some(TimeZoneTransition {
333 timestamp: trans.timestamp,
334 offset: typ.offset,
335 abbrev: self.designation(typ),
336 dst: typ.is_dst,
337 })
338 }
339
340 /// Returns the timestamp of the soonest time zone transition after the
341 /// timestamp given. If one doesn't exist, `None` is returned.
342 pub(crate) fn next_transition(
343 &self,
344 ts: Timestamp,
345 ) -> Option<TimeZoneTransition> {
346 assert!(!self.transitions.is_empty(), "transitions is non-empty");
347 let search =
348 self.transitions.binary_search_by_key(&ts, |t| t.timestamp);
349 let index = match search {
350 Ok(i) => i.checked_add(1)?,
351 Err(i) => i,
352 };
353 let trans = if index == 0 {
354 // The first transition is a dummy that we insert, so if we land on
355 // it here, treat it as if it doesn't exist.
356 return None;
357 } else if index >= self.transitions.len() - 1 {
358 if let Some(ref posix_tz) = self.posix_tz {
359 // Since the POSIX TZ must be consistent with the last
360 // transition, it must be the case that next.timestamp <=
361 // posix_next_tans in all cases. So the transition according to
362 // the POSIX TZ is always correct here.
363 //
364 // What if this returns `None` though? I'm not sure in which
365 // cases that could matter, and I think it might be a violation
366 // of the TZif format if it does.
367 return posix_tz.next_transition(ts);
368 }
369 self.transitions.last().expect("last transition")
370 } else {
371 &self.transitions[index]
372 };
373 let typ = &self.types[usize::from(trans.type_index)];
374 Some(TimeZoneTransition {
375 timestamp: trans.timestamp,
376 offset: typ.offset,
377 abbrev: self.designation(typ),
378 dst: typ.is_dst,
379 })
380 }
381
382 fn designation(&self, typ: &LocalTimeType) -> &str {
383 // OK because we verify that the designation range on every local
384 // time type is a valid range into `self.designations`.
385 &self.designations[typ.designation()]
386 }
387
388 fn local_time_type(&self, transition: &Transition) -> &LocalTimeType {
389 // OK because we require that `type_index` always points to a valid
390 // local time type.
391 &self.types[usize::from(transition.type_index)]
392 }
393
394 fn first_transition(&self) -> &Transition {
395 // OK because we know we have at least one transition. This isn't
396 // true generally of the TZif format, since it does actually permit 0
397 // transitions. But as part of parsing, we always add a "dummy" first
398 // transition corresponding to the minimum possible Jiff timestamp.
399 // This makes some logic for transition lookups a little simpler by
400 // reducing special cases.
401 self.transitions.first().unwrap()
402 }
403
404 fn parse32<'b>(
405 name: Option<String>,
406 header32: Header,
407 bytes: &'b [u8],
408 ) -> Result<(Tzif, &'b [u8]), Error> {
409 let mut tzif = Tzif {
410 name,
411 version: header32.version,
412 // filled in later
413 checksum: 0,
414 transitions: vec![],
415 types: vec![],
416 designations: String::new(),
417 leap_seconds: vec![],
418 posix_tz: None,
419 };
420 let rest = tzif.parse_transitions(&header32, bytes)?;
421 let rest = tzif.parse_transition_types(&header32, rest)?;
422 let rest = tzif.parse_local_time_types(&header32, rest)?;
423 let rest = tzif.parse_time_zone_designations(&header32, rest)?;
424 let rest = tzif.parse_leap_seconds(&header32, rest)?;
425 let rest = tzif.parse_indicators(&header32, rest)?;
426 tzif.set_wall_datetimes();
427 Ok((tzif, rest))
428 }
429
430 fn parse64<'b>(
431 name: Option<String>,
432 header32: Header,
433 bytes: &'b [u8],
434 ) -> Result<(Tzif, &'b [u8]), Error> {
435 let (_, rest) = try_split_at(
436 "V1 TZif data block",
437 bytes,
438 header32.data_block_len()?,
439 )?;
440 let (header64, rest) = Header::parse(8, rest)
441 .map_err(|e| e.context("failed to parse 64-bit header"))?;
442 let mut tzif = Tzif {
443 name,
444 version: header64.version,
445 // filled in later
446 checksum: 0,
447 transitions: vec![],
448 types: vec![],
449 designations: String::new(),
450 leap_seconds: vec![],
451 posix_tz: None,
452 };
453 let rest = tzif.parse_transitions(&header64, rest)?;
454 let rest = tzif.parse_transition_types(&header64, rest)?;
455 let rest = tzif.parse_local_time_types(&header64, rest)?;
456 let rest = tzif.parse_time_zone_designations(&header64, rest)?;
457 let rest = tzif.parse_leap_seconds(&header64, rest)?;
458 let rest = tzif.parse_indicators(&header64, rest)?;
459 let rest = tzif.parse_footer(&header64, rest)?;
460 // Validates that the POSIX TZ string we parsed (if one exists) is
461 // consistent with the last transition in this time zone. This is
462 // required by RFC 8536.
463 //
464 // RFC 8536 says, "If the string is nonempty and one or more
465 // transitions appear in the version 2+ data, the string MUST be
466 // consistent with the last version 2+ transition."
467 //
468 // We need to be a little careful, since we always have at least one
469 // transition (accounting for the dummy `Timestamp::MIN` transition).
470 // So if we only have 1 transition and a POSIX TZ string, then we
471 // should not validate it since it's equivalent to the case of 0
472 // transitions and a POSIX TZ string.
473 if tzif.transitions.len() > 1 {
474 if let Some(ref tz) = tzif.posix_tz {
475 let last = tzif.transitions.last().expect("last transition");
476 let typ = tzif.local_time_type(last);
477 let info = tz.to_offset_info(last.timestamp);
478 if info.offset() != typ.offset {
479 return Err(err!(
480 "expected last transition to have DST offset \
481 of {}, but got {} according to POSIX TZ \
482 string {}",
483 typ.offset,
484 info.offset(),
485 tz,
486 ));
487 }
488 if info.dst() != typ.is_dst {
489 return Err(err!(
490 "expected last transition to have is_dst={}, \
491 but got is_dst={} according to POSIX TZ \
492 string {}",
493 typ.is_dst.is_dst(),
494 info.dst().is_dst(),
495 tz,
496 ));
497 }
498 if info.abbreviation() != tzif.designation(&typ) {
499 return Err(err!(
500 "expected last transition to have \
501 designation={}, \
502 but got designation={} according to POSIX TZ \
503 string {}",
504 info.abbreviation(),
505 tzif.designation(&typ),
506 tz,
507 ));
508 }
509 }
510 }
511 tzif.set_wall_datetimes();
512 // N.B. We don't check that the TZif data is fully valid. It
513 // is possible for it to contain superfluous information. For
514 // example, a non-zero local time type that is never referenced
515 // by a transition.
516 Ok((tzif, rest))
517 }
518
519 fn parse_transitions<'b>(
520 &mut self,
521 header: &Header,
522 bytes: &'b [u8],
523 ) -> Result<&'b [u8], Error> {
524 let (bytes, rest) = try_split_at(
525 "transition times data block",
526 bytes,
527 header.transition_times_len()?,
528 )?;
529 let mut it = bytes.chunks_exact(header.time_size);
530 // RFC 8536 says: "If there are no transitions, local time for all
531 // timestamps is specified by the TZ string in the footer if present
532 // and nonempty; otherwise, it is specified by time type 0."
533 //
534 // RFC 8536 also says: "Local time for timestamps before the first
535 // transition is specified by the first time type (time type
536 // 0)."
537 //
538 // So if there are no transitions, pushing this dummy one will result
539 // in the desired behavior even when it's the only transition.
540 // Similarly, since this is the minimum timestamp value, it will
541 // trigger for any times before the first transition found in the TZif
542 // data.
543 self.transitions.push(Transition {
544 timestamp: Timestamp::MIN,
545 wall: TransitionWall::Unambiguous { start: DateTime::MIN },
546 type_index: 0,
547 });
548 while let Some(chunk) = it.next() {
549 let seconds = if header.is_32bit() {
550 i64::from(from_be_bytes_i32(chunk))
551 } else {
552 from_be_bytes_i64(chunk)
553 };
554 let timestamp =
555 Timestamp::from_second(seconds).unwrap_or_else(|_| {
556 // We really shouldn't error here just because the Unix
557 // timestamp is outside what Jiff supports. Since what Jiff
558 // supports is _somewhat_ arbitrary. But Jiff's supported
559 // range is good enough for all realistic purposes, so we
560 // just clamp an out-of-range Unix timestamp to the Jiff
561 // min or max value.
562 //
563 // This can't result in the sorting order being wrong, but
564 // it can result in a transition that is duplicative with
565 // the dummy transition we inserted above. This should be
566 // fine.
567 let clamped = seconds
568 .clamp(UnixSeconds::MIN_REPR, UnixSeconds::MAX_REPR);
569 warn!(
570 "found Unix timestamp {seconds} that is outside \
571 Jiff's supported range, clamping to {clamped}",
572 );
573 // Guaranteed to succeed since we clamped `seconds` such
574 // that it is in the supported range of `Timestamp`.
575 Timestamp::from_second(clamped).unwrap()
576 });
577 self.transitions.push(Transition {
578 timestamp,
579 // We can't compute the wall clock times until we know the
580 // actual offset for the transition prior to this one. We don't
581 // know that until we parse the local time types.
582 wall: TransitionWall::Unambiguous {
583 start: DateTime::default(),
584 },
585 // We can't fill in the type index either. We fill this in
586 // later when we parse the transition types.
587 type_index: 0,
588 });
589 }
590 assert!(it.remainder().is_empty());
591 Ok(rest)
592 }
593
594 fn parse_transition_types<'b>(
595 &mut self,
596 header: &Header,
597 bytes: &'b [u8],
598 ) -> Result<&'b [u8], Error> {
599 let (bytes, rest) = try_split_at(
600 "transition types data block",
601 bytes,
602 header.transition_types_len()?,
603 )?;
604 // We start our transition indices at 1 because we always insert a
605 // dummy first transition corresponding to `Timestamp::MIN`. Its type
606 // index is always 0, so there's no need to change it here.
607 for (transition_index, &type_index) in (1..).zip(bytes) {
608 if usize::from(type_index) >= header.tzh_typecnt {
609 return Err(err!(
610 "found transition type index {type_index},
611 but there are only {} local time types",
612 header.tzh_typecnt,
613 ));
614 }
615 self.transitions[transition_index].type_index = type_index;
616 }
617 Ok(rest)
618 }
619
620 fn parse_local_time_types<'b>(
621 &mut self,
622 header: &Header,
623 bytes: &'b [u8],
624 ) -> Result<&'b [u8], Error> {
625 let (bytes, rest) = try_split_at(
626 "local time types data block",
627 bytes,
628 header.local_time_types_len()?,
629 )?;
630 let mut it = bytes.chunks_exact(6);
631 while let Some(chunk) = it.next() {
632 let offset_seconds = from_be_bytes_i32(&chunk[..4]);
633 let offset =
634 Offset::from_seconds(offset_seconds).map_err(|e| {
635 err!(
636 "found local time type with out-of-bounds offset: {e}"
637 )
638 })?;
639 let is_dst = Dst::from(chunk[4] == 1);
640 let designation = chunk[5]..chunk[5];
641 self.types.push(LocalTimeType {
642 offset,
643 is_dst,
644 designation,
645 indicator: Indicator::LocalWall,
646 });
647 }
648 assert!(it.remainder().is_empty());
649 Ok(rest)
650 }
651
652 fn parse_time_zone_designations<'b>(
653 &mut self,
654 header: &Header,
655 bytes: &'b [u8],
656 ) -> Result<&'b [u8], Error> {
657 let (bytes, rest) = try_split_at(
658 "time zone designations data block",
659 bytes,
660 header.time_zone_designations_len()?,
661 )?;
662 self.designations =
663 String::from_utf8(bytes.to_vec()).map_err(|_| {
664 err!(
665 "time zone designations are not valid UTF-8: {:?}",
666 Bytes(bytes),
667 )
668 })?;
669 // Holy hell, this is brutal. The boundary conditions are crazy.
670 for (i, typ) in self.types.iter_mut().enumerate() {
671 let start = usize::from(typ.designation.start);
672 let Some(suffix) = self.designations.get(start..) else {
673 return Err(err!(
674 "local time type {i} has designation index of {start}, \
675 but cannot be more than {}",
676 self.designations.len(),
677 ));
678 };
679 let Some(len) = suffix.find('\x00') else {
680 return Err(err!(
681 "local time type {i} has designation index of {start}, \
682 but could not find NUL terminator after it in \
683 designations: {:?}",
684 self.designations,
685 ));
686 };
687 let Some(end) = start.checked_add(len) else {
688 return Err(err!(
689 "local time type {i} has designation index of {start}, \
690 but its length {len} is too big",
691 ));
692 };
693 typ.designation.end = u8::try_from(end).map_err(|_| {
694 err!(
695 "local time type {i} has designation range of \
696 {start}..{end}, but end is too big",
697 )
698 })?;
699 }
700 Ok(rest)
701 }
702
703 fn parse_leap_seconds<'b>(
704 &mut self,
705 header: &Header,
706 bytes: &'b [u8],
707 ) -> Result<&'b [u8], Error> {
708 let (bytes, rest) = try_split_at(
709 "leap seconds data block",
710 bytes,
711 header.leap_second_len()?,
712 )?;
713 let chunk_len = header
714 .time_size
715 .checked_add(4)
716 .expect("time_size plus 4 fits in usize");
717 let mut it = bytes.chunks_exact(chunk_len);
718 while let Some(chunk) = it.next() {
719 let (occur_bytes, corr_bytes) = chunk.split_at(header.time_size);
720 let occur_seconds = if header.is_32bit() {
721 i64::from(from_be_bytes_i32(occur_bytes))
722 } else {
723 from_be_bytes_i64(occur_bytes)
724 };
725 let occurrence =
726 Timestamp::from_second(occur_seconds).map_err(|e| {
727 err!(
728 "leap second occurrence {occur_seconds} \
729 is out of range: {e}"
730 )
731 })?;
732 let correction = from_be_bytes_i32(corr_bytes);
733 self.leap_seconds.push(LeapSecond { occurrence, correction });
734 }
735 assert!(it.remainder().is_empty());
736 Ok(rest)
737 }
738
739 fn parse_indicators<'b>(
740 &mut self,
741 header: &Header,
742 bytes: &'b [u8],
743 ) -> Result<&'b [u8], Error> {
744 let (std_wall_bytes, rest) = try_split_at(
745 "standard/wall indicators data block",
746 bytes,
747 header.standard_wall_len()?,
748 )?;
749 let (ut_local_bytes, rest) = try_split_at(
750 "UT/local indicators data block",
751 rest,
752 header.ut_local_len()?,
753 )?;
754 if std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
755 // This is a weird case, but technically possible only if all
756 // UT/local indicators are 0. If any are 1, then it's an error,
757 // because it would require the corresponding std/wall indicator
758 // to be 1 too. Which it can't be, because there aren't any. So
759 // we just check that they're all zeros.
760 for (i, &byte) in ut_local_bytes.iter().enumerate() {
761 if byte != 0 {
762 return Err(err!(
763 "found UT/local indicator '{byte}' for local time \
764 type {i}, but it must be 0 since all std/wall \
765 indicators are 0",
766 ));
767 }
768 }
769 } else if !std_wall_bytes.is_empty() && ut_local_bytes.is_empty() {
770 for (i, &byte) in std_wall_bytes.iter().enumerate() {
771 // Indexing is OK because Header guarantees that the number of
772 // indicators is 0 or equal to the number of types.
773 self.types[i].indicator = if byte == 0 {
774 Indicator::LocalWall
775 } else if byte == 1 {
776 Indicator::LocalStandard
777 } else {
778 return Err(err!(
779 "found invalid std/wall indicator '{byte}' for \
780 local time type {i}, it must be 0 or 1",
781 ));
782 };
783 }
784 } else if !std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
785 assert_eq!(std_wall_bytes.len(), ut_local_bytes.len());
786 let it = std_wall_bytes.iter().zip(ut_local_bytes);
787 for (i, (&stdwall, &utlocal)) in it.enumerate() {
788 // Indexing is OK because Header guarantees that the number of
789 // indicators is 0 or equal to the number of types.
790 self.types[i].indicator = match (stdwall, utlocal) {
791 (0, 0) => Indicator::LocalWall,
792 (1, 0) => Indicator::LocalStandard,
793 (1, 1) => Indicator::UTStandard,
794 (0, 1) => {
795 return Err(err!(
796 "found illegal ut-wall combination for \
797 local time type {i}, only local-wall, local-standard \
798 and ut-standard are allowed",
799 ))
800 }
801 _ => {
802 return Err(err!(
803 "found illegal std/wall or ut/local value for \
804 local time type {i}, each must be 0 or 1",
805 ))
806 }
807 };
808 }
809 } else {
810 // If they're both empty then we don't need to do anything. Every
811 // local time type record already has the correct default for this
812 // case set.
813 debug_assert!(std_wall_bytes.is_empty());
814 debug_assert!(ut_local_bytes.is_empty());
815 }
816 Ok(rest)
817 }
818
819 fn parse_footer<'b>(
820 &mut self,
821 _header: &Header,
822 bytes: &'b [u8],
823 ) -> Result<&'b [u8], Error> {
824 if bytes.is_empty() {
825 return Err(err!(
826 "invalid V2+ TZif footer, expected \\n, \
827 but found unexpected end of data",
828 ));
829 }
830 if bytes[0] != b'\n' {
831 return Err(err!(
832 "invalid V2+ TZif footer, expected {:?}, but found {:?}",
833 Byte(b'\n'),
834 Byte(bytes[0]),
835 ));
836 }
837 let bytes = &bytes[1..];
838 // Only scan up to 1KB for a NUL terminator in case we somehow got
839 // passed a huge block of bytes.
840 let toscan = &bytes[..bytes.len().min(1024)];
841 let Some(nlat) = toscan.iter().position(|&b| b == b'\n') else {
842 return Err(err!(
843 "invalid V2 TZif footer, could not find {:?} \
844 terminator in: {:?}",
845 Byte(b'\n'),
846 Bytes(toscan),
847 ));
848 };
849 let (bytes, rest) = bytes.split_at(nlat);
850 if !bytes.is_empty() {
851 // We could in theory limit TZ strings to their strict POSIX
852 // definition here for TZif V2, but I don't think there is any
853 // harm in allowing the extensions in V2 formatted TZif data. Note
854 // that the GNU tooling allow it via the `TZ` environment variable
855 // even though POSIX doesn't specify it. This all seems okay to me
856 // because the V3+ extension is a strict superset of functionality.
857 let iana_tz = IanaTz::parse_v3plus(bytes)?;
858 self.posix_tz = Some(iana_tz.into_tz());
859 }
860 Ok(&rest[1..])
861 }
862
863 /// This sets the wall clock times for each transition.
864 ///
865 /// The wall clock time corresponds to time on the clock that the
866 /// transition begins. That is, it is the time offset by the previous
867 /// transition's offset.
868 ///
869 /// This also computes whether there is a gap or fold or neither between
870 /// each transition. This is used to resolve ambiguous timestamps when
871 /// given a civil datetime.
872 fn set_wall_datetimes(&mut self) {
873 let mut prev = self.local_time_type(self.first_transition()).offset;
874 // We iterate over indices instead of `transitions.iter_mut()` because
875 // of the borrow checker breaking composition.
876 for i in 0..self.transitions.len() {
877 let this = self.local_time_type(&self.transitions[i]).offset;
878 let t = &mut self.transitions[i];
879 t.wall = if prev == this {
880 // Equivalent offsets means there can never be any ambiguity.
881 let start = prev.to_datetime(t.timestamp);
882 TransitionWall::Unambiguous { start }
883 } else if prev < this {
884 // When the offset of the previous transition is less, that
885 // means there is some non-zero amount of time that is
886 // "skipped" when moving to the next transition. Thus, we have
887 // a gap. The start of the gap is the offset which gets us the
888 // earliest time, i.e., the smaller of the two offsets.
889 let start = prev.to_datetime(t.timestamp);
890 let end = this.to_datetime(t.timestamp);
891 TransitionWall::Gap { start, end }
892 } else {
893 // When the offset of the previous transition is greater, that
894 // means there is some non-zero amount of time that will be
895 // replayed on a wall clock in this time zone. Thus, we have
896 // a fold. The start of the gold is the offset which gets us
897 // the earliest time, i.e., the smaller of the two offsets.
898 assert!(prev > this);
899 let start = this.to_datetime(t.timestamp);
900 let end = prev.to_datetime(t.timestamp);
901 TransitionWall::Fold { start, end }
902 };
903 prev = this;
904 }
905 }
906}
907
908impl Eq for Tzif {}
909
910impl PartialEq for Tzif {
911 fn eq(&self, rhs: &Tzif) -> bool {
912 self.name == rhs.name && self.checksum == rhs.checksum
913 }
914}
915
/// A transition to a different offset.
///
/// Each value pairs the instant a transition takes effect with the index of
/// the local time type that applies from then on. The `wall` field holds
/// boundaries derived from those two pieces of data; it is assigned by
/// `Tzif::set_wall_datetimes`.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Transition {
    /// The UNIX leap time at which the transition starts. The transition
    /// continues up to and _not_ including the next transition.
    timestamp: Timestamp,
    /// The wall clock time for when this transition begins. This includes
    /// boundary conditions for quickly determining if a given wall clock time
    /// is ambiguous (i.e., falls in a gap or a fold).
    wall: TransitionWall,
    /// The index into the sequence of local time type records. This is what
    /// provides the correct offset (from UTC) that is active beginning at
    /// this transition.
    ///
    /// It is stored as a `u8` and widened with `usize::from` when used to
    /// index `Tzif::types`.
    type_index: u8,
}
931
932/// The wall clock time for when a transition begins.
933///
934/// This explicitly represents ambiguous wall clock times that occur at the
935/// boundaries of transitions.
936///
937/// The start of the wall clock time is always the earlier possible wall clock
938/// time that could occur with this transition's corresponding offset. For a
939/// gap, it's the previous transition's offset. For a fold, it's the current
940/// transition's offset.
941///
942/// For example, DST for `America/New_York` began on `2024-03-10T07:00:00+00`.
943/// The offset prior to this instant in time is `-05`, corresponding
944/// to standard time (EST). Thus, in wall clock time, DST began at
945/// `2024-03-10T02:00:00`. And since this is a DST transition that jumps ahead
946/// an hour, the start of DST also corresponds to the start of a gap. That is,
947/// the times `02:00:00` through `02:59:59` never appear on a clock for this
948/// hour. The question is thus: which offset should we apply to `02:00:00`?
/// We could apply the offset from the earlier transition `-05` and get
/// `2024-03-10T02:00:00-05` (that's `2024-03-10T07:00:00+00`, which reads as
/// `03:00:00` under `-04`), or we could apply the offset from the later
/// transition `-04` and get `2024-03-10T02:00:00-04` (that's
/// `2024-03-10T06:00:00+00`, which reads as `01:00:00` under `-05`).
953///
954/// So in the above, we would have a `Gap` variant where `start` (inclusive) is
955/// `2024-03-10T02:00:00` and `end` (exclusive) is `2024-03-10T03:00:00`.
956///
957/// The fold case is the same idea, but where the same time is repeated.
958/// For example, in `America/New_York`, standard time began on
959/// `2024-11-03T06:00:00+00`. The offset prior to this instant in time
960/// is `-04`, corresponding to DST (EDT). Thus, in wall clock time, DST
961/// ended at `2024-11-03T02:00:00`. However, since this is a fold, the
962/// actual set of ambiguous times begins at `2024-11-03T01:00:00` and
963/// ends at `2024-11-03T01:59:59.999999999`. That is, the wall clock time
964/// `2024-11-03T02:00:00` is unambiguous.
965///
966/// So in the fold case above, we would have a `Fold` variant where
967/// `start` (inclusive) is `2024-11-03T01:00:00` and `end` (exclusive) is
968/// `2024-11-03T02:00:00`.
969///
970/// Since this gets bundled in with the sorted sequence of transitions, we'll
971/// use the "start" time in all three cases as our target of binary search.
972/// Once we land on a transition, we'll know our given wall clock time is
973/// greater than or equal to its start wall clock time. At that point, to
974/// determine if there is ambiguity, we merely need to determine if the given
975/// wall clock time is less than the corresponding `end` time. If it is, then
976/// it falls in a gap or fold. Otherwise, it's unambiguous.
977///
978/// Note that we could compute these datetime values while searching for the
979/// correct transition, but there's a fair bit of math involved in going
980/// between timestamps (which is what TZif gives us) and calendar datetimes
981/// (which is what we're given as input). It is also necessary that we offset
982/// the timestamp given in TZif at some point, since it is in UTC and the
983/// datetime given is in wall clock time. So I decided it would be worth
984/// pre-computing what we need in terms of what the input is. This way, we
985/// don't need to do any conversions, or indeed, any arithmetic at all, for
/// time zone lookups. We *could* store these as timestamps, but then the
/// input datetime would need to be converted to a timestamp before searching
/// the transitions.
#[derive(Clone, Debug, Eq, PartialEq)]
enum TransitionWall {
    /// This transition cannot possibly lead to an ambiguous offset because
    /// its offset is equivalent to the offset of the previous transition.
    Unambiguous {
        /// The civil datetime corresponding to the beginning of this
        /// transition, inclusive.
        start: DateTime,
    },
    /// This occurs when this transition's offset is strictly greater than the
    /// previous transition's offset. This effectively results in a "gap" of
    /// time equal to the difference in the offsets between the two
    /// transitions.
    Gap {
        /// The start of a gap (inclusive) in wall clock time.
        start: DateTime,
        /// The end of the gap (exclusive) in wall clock time.
        end: DateTime,
    },
    /// This occurs when this transition's offset is strictly less than the
    /// previous transition's offset. This results in a "fold" of time where
    /// the two transitions have an overlap where it is ambiguous which one
    /// applies given a wall clock time. In effect, a span of time equal to the
    /// difference in the offsets is repeated.
    Fold {
        /// The start of the fold (inclusive) in wall clock time.
        start: DateTime,
        /// The end of the fold (exclusive) in wall clock time.
        end: DateTime,
    },
}
1020
1021impl TransitionWall {
1022 fn start(&self) -> DateTime {
1023 match *self {
1024 TransitionWall::Unambiguous { start } => start,
1025 TransitionWall::Gap { start, .. } => start,
1026 TransitionWall::Fold { start, .. } => start,
1027 }
1028 }
1029}
1030
/// A single local time type.
///
/// Basically, this is what transition times map to. Once you have a local time
/// type, then you know the offset, whether it's in DST and the corresponding
/// abbreviation. (There is also an "indicator," but I have no clue what it
/// means. See the `Indicator` type for a rant.)
#[derive(Clone, Debug, Eq, PartialEq)]
struct LocalTimeType {
    /// The offset associated with this local time type.
    offset: Offset,
    /// Whether this local time type is in DST.
    is_dst: Dst,
    /// The range identifying this type's abbreviation. It is stored with
    /// `u8` bounds and widened to `Range<usize>` by
    /// `LocalTimeType::designation`.
    ///
    /// NOTE(review): presumably a byte range into `Tzif::designations`;
    /// confirm against the parser.
    designation: Range<u8>,
    /// The indicator recorded for this local time type. See `Indicator`.
    indicator: Indicator,
}
1044
1045impl LocalTimeType {
1046 fn designation(&self) -> Range<usize> {
1047 usize::from(self.designation.start)..usize::from(self.designation.end)
1048 }
1049}
1050
1051/// This enum corresponds to the possible indicator values for standard/wall
1052/// and UT/local.
1053///
1054/// Note that UT+Wall is not allowed.
1055///
1056/// I honestly have no earthly clue what they mean. I've read the section about
1057/// them in RFC 8536 several times and I can't make sense of it. I've even
1058/// looked at data files that have these set and still can't make sense of
1059/// them. I've even looked at what other datetime libraries do with these, and
1060/// they all seem to just ignore them. Like, WTF. I've spent the last couple
1061/// months of my life steeped in time, and I just cannot figure this out. Am I
1062/// just dumb?
1063///
1064/// Anyway, we parse them, but otherwise ignore them because that's what all
1065/// the cool kids do.
1066///
1067/// The default is `LocalWall`, which also occurs when no indicators are
1068/// present.
1069///
1070/// I tried again and still don't get it. Here's a dump for `Pacific/Honolulu`:
1071///
1072/// ```text
1073/// $ ./scripts/jiff-debug tzif /usr/share/zoneinfo/Pacific/Honolulu
1074/// TIME ZONE NAME
1075/// /usr/share/zoneinfo/Pacific/Honolulu
1076/// LOCAL TIME TYPES
1077/// 000: offset=-10:31:26, is_dst=false, designation=LMT, indicator=local/wall
1078/// 001: offset=-10:30, is_dst=false, designation=HST, indicator=local/wall
1079/// 002: offset=-09:30, is_dst=true, designation=HDT, indicator=local/wall
1080/// 003: offset=-09:30, is_dst=true, designation=HWT, indicator=local/wall
1081/// 004: offset=-09:30, is_dst=true, designation=HPT, indicator=ut/std
1082/// 005: offset=-10, is_dst=false, designation=HST, indicator=local/wall
1083/// TRANSITIONS
1084/// 0000: -9999-01-02T01:59:59 :: -377705023201 :: type=0, -10:31:26, is_dst=false, LMT, local/wall
1085/// 0001: 1896-01-13T22:31:26 :: -2334101314 :: type=1, -10:30, is_dst=false, HST, local/wall
1086/// 0002: 1933-04-30T12:30:00 :: -1157283000 :: type=2, -09:30, is_dst=true, HDT, local/wall
1087/// 0003: 1933-05-21T21:30:00 :: -1155436200 :: type=1, -10:30, is_dst=false, HST, local/wall
1088/// 0004: 1942-02-09T12:30:00 :: -880198200 :: type=3, -09:30, is_dst=true, HWT, local/wall
1089/// 0005: 1945-08-14T23:00:00 :: -769395600 :: type=4, -09:30, is_dst=true, HPT, ut/std
1090/// 0006: 1945-09-30T11:30:00 :: -765376200 :: type=1, -10:30, is_dst=false, HST, local/wall
1091/// 0007: 1947-06-08T12:30:00 :: -712150200 :: type=5, -10, is_dst=false, HST, local/wall
1092/// POSIX TIME ZONE STRING
1093/// HST10
1094/// ```
1095///
1096/// See how type 004 has a ut/std indicator? What the fuck does that mean?
1097/// All transitions are defined in terms of UTC. I confirmed this with `zdump`:
1098///
1099/// ```text
1100/// $ zdump -v Pacific/Honolulu | rg 1945
1101/// Pacific/Honolulu Tue Aug 14 22:59:59 1945 UT = Tue Aug 14 13:29:59 1945 HWT isdst=1 gmtoff=-34200
1102/// Pacific/Honolulu Tue Aug 14 23:00:00 1945 UT = Tue Aug 14 13:30:00 1945 HPT isdst=1 gmtoff=-34200
1103/// Pacific/Honolulu Sun Sep 30 11:29:59 1945 UT = Sun Sep 30 01:59:59 1945 HPT isdst=1 gmtoff=-34200
1104/// Pacific/Honolulu Sun Sep 30 11:30:00 1945 UT = Sun Sep 30 01:00:00 1945 HST isdst=0 gmtoff=-37800
1105/// ```
1106///
1107/// The times match up. All of them. The indicators don't seem to make a
1108/// difference. I'm clearly missing something.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Indicator {
    /// Local time, wall clock. Displayed as `local/wall`.
    LocalWall,
    /// Local time, standard time. Displayed as `local/std`.
    LocalStandard,
    /// Universal time, standard time. Displayed as `ut/std`.
    UTStandard,
}

impl core::fmt::Display for Indicator {
    /// Writes the short `<ut-or-local>/<std-or-wall>` label for this
    /// indicator.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let label = match *self {
            Indicator::LocalWall => "local/wall",
            Indicator::LocalStandard => "local/std",
            Indicator::UTStandard => "ut/std",
        };
        f.write_str(label)
    }
}
1125
/// A leap second "correction" record.
///
/// A record pairs the instant at which a leap second occurs with the
/// correction value associated with it.
#[derive(Clone, Debug, Eq, PartialEq)]
struct LeapSecond {
    /// The Unix leap time at which the leap second occurred.
    occurrence: Timestamp,
    /// The leap second offset. Usually +1 or -1.
    ///
    /// NOTE(review): RFC 8536 describes the on-disk correction as the
    /// accumulated value in effect on or after the occurrence, so it is not
    /// necessarily +1 or -1. Confirm whether the parser stores the raw value.
    correction: i32,
}
1134
/// The header for a TZif formatted file.
///
/// V2+ TZif format have two headers: one for V1 data, and then a second
/// following the V1 data block that describes another data block which uses
/// 64-bit timestamps. The two headers both have the same format and both
/// use 32-bit big-endian encoded integers.
///
/// The count fields below determine the size of the data block that follows
/// the header; see `Header::data_block_len`.
#[derive(Debug)]
struct Header {
    /// The size of the timestamps encoded in the data block.
    ///
    /// This is guaranteed to be either 4 (for V1) or 8 (for the 64-bit header
    /// block in V2+). `Header::parse` asserts this.
    time_size: usize,
    /// The file format version.
    ///
    /// Note that this is either a NUL byte (for version 1), or an ASCII byte
    /// corresponding to the version number. That is, `0x32` for `2`, `0x33`
    /// for `3` or `0x34` for `4`. Note also that just because zoneinfo might
    /// have been recently generated does not mean it uses the latest format
    /// version. It seems like newer versions are only compiled by `zic` when
    /// they are needed. For example, `America/New_York` on my system (as of
    /// `2024-03-25`) has version `0x32`, but `Asia/Jerusalem` has version
    /// `0x33`.
    version: u8,
    /// Number of UT/local indicators stored in the file.
    ///
    /// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
    /// Each indicator contributes one byte to the data block length.
    tzh_ttisutcnt: usize,
    /// The number of standard/wall indicators stored in the file.
    ///
    /// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
    /// Each indicator contributes one byte to the data block length.
    tzh_ttisstdcnt: usize,
    /// The number of leap seconds for which data entries are stored in the
    /// file.
    ///
    /// Each entry contributes `time_size + 4` bytes to the data block length.
    tzh_leapcnt: usize,
    /// The number of transition times for which data entries are stored in
    /// the file.
    ///
    /// Each entry contributes `time_size` bytes for its transition time plus
    /// one byte for its transition type to the data block length.
    tzh_timecnt: usize,
    /// The number of local time types for which data entries are stored in the
    /// file.
    ///
    /// This is checked to be at least `1`. Each entry contributes 6 bytes to
    /// the data block length.
    tzh_typecnt: usize,
    /// The number of bytes of time zone abbreviation strings stored in the
    /// file.
    ///
    /// This is checked to be at least `1`.
    tzh_charcnt: usize,
}
1184
1185impl Header {
1186 /// Parse the header record from the given bytes.
1187 ///
1188 /// Upon success, return the header and all bytes after the header.
1189 ///
1190 /// The given `time_size` must be 4 or 8, corresponding to either the
1191 /// V1 header block or the V2+ header block, respectively.
1192 fn parse(
1193 time_size: usize,
1194 bytes: &[u8],
1195 ) -> Result<(Header, &[u8]), Error> {
1196 assert!(time_size == 4 || time_size == 8, "time size must be 4 or 8");
1197 if bytes.len() < 44 {
1198 return Err(err!("invalid header: too short"));
1199 }
1200 let (magic, rest) = bytes.split_at(4);
1201 if magic != b"TZif" {
1202 return Err(err!("invalid header: magic bytes mismatch"));
1203 }
1204 let (version, rest) = rest.split_at(1);
1205 let (_reserved, rest) = rest.split_at(15);
1206
1207 let (tzh_ttisutcnt_bytes, rest) = rest.split_at(4);
1208 let (tzh_ttisstdcnt_bytes, rest) = rest.split_at(4);
1209 let (tzh_leapcnt_bytes, rest) = rest.split_at(4);
1210 let (tzh_timecnt_bytes, rest) = rest.split_at(4);
1211 let (tzh_typecnt_bytes, rest) = rest.split_at(4);
1212 let (tzh_charcnt_bytes, rest) = rest.split_at(4);
1213
1214 let tzh_ttisutcnt = from_be_bytes_u32_to_usize(tzh_ttisutcnt_bytes)
1215 .map_err(|e| e.context("failed to parse tzh_ttisutcnt"))?;
1216 let tzh_ttisstdcnt = from_be_bytes_u32_to_usize(tzh_ttisstdcnt_bytes)
1217 .map_err(|e| e.context("failed to parse tzh_ttisstdcnt"))?;
1218 let tzh_leapcnt = from_be_bytes_u32_to_usize(tzh_leapcnt_bytes)
1219 .map_err(|e| e.context("failed to parse tzh_leapcnt"))?;
1220 let tzh_timecnt = from_be_bytes_u32_to_usize(tzh_timecnt_bytes)
1221 .map_err(|e| e.context("failed to parse tzh_timecnt"))?;
1222 let tzh_typecnt = from_be_bytes_u32_to_usize(tzh_typecnt_bytes)
1223 .map_err(|e| e.context("failed to parse tzh_typecnt"))?;
1224 let tzh_charcnt = from_be_bytes_u32_to_usize(tzh_charcnt_bytes)
1225 .map_err(|e| e.context("failed to parse tzh_charcnt"))?;
1226
1227 if tzh_ttisutcnt != 0 && tzh_ttisutcnt != tzh_typecnt {
1228 return Err(err!(
1229 "expected tzh_ttisutcnt={tzh_ttisutcnt} to be zero \
1230 or equal to tzh_typecnt={tzh_typecnt}",
1231 ));
1232 }
1233 if tzh_ttisstdcnt != 0 && tzh_ttisstdcnt != tzh_typecnt {
1234 return Err(err!(
1235 "expected tzh_ttisstdcnt={tzh_ttisstdcnt} to be zero \
1236 or equal to tzh_typecnt={tzh_typecnt}",
1237 ));
1238 }
1239 if tzh_typecnt < 1 {
1240 return Err(err!(
1241 "expected tzh_typecnt={tzh_typecnt} to be at least 1",
1242 ));
1243 }
1244 if tzh_charcnt < 1 {
1245 return Err(err!(
1246 "expected tzh_charcnt={tzh_charcnt} to be at least 1",
1247 ));
1248 }
1249
1250 let header = Header {
1251 time_size,
1252 version: version[0],
1253 tzh_ttisutcnt,
1254 tzh_ttisstdcnt,
1255 tzh_leapcnt,
1256 tzh_timecnt,
1257 tzh_typecnt,
1258 tzh_charcnt,
1259 };
1260 Ok((header, rest))
1261 }
1262
1263 /// Returns true if this header is for a 32-bit data block.
1264 ///
1265 /// When false, it is guaranteed that this header is for a 64-bit data
1266 /// block.
1267 fn is_32bit(&self) -> bool {
1268 self.time_size == 4
1269 }
1270
1271 /// Returns the size of the data block, in bytes, for this header.
1272 ///
1273 /// This returns an error if the arithmetic required to compute the
1274 /// length would overflow.
1275 ///
1276 /// This is useful for, e.g., skipping over the 32-bit V1 data block in
1277 /// V2+ TZif formatted files.
1278 fn data_block_len(&self) -> Result<usize, Error> {
1279 let a = self.transition_times_len()?;
1280 let b = self.transition_types_len()?;
1281 let c = self.local_time_types_len()?;
1282 let d = self.time_zone_designations_len()?;
1283 let e = self.leap_second_len()?;
1284 let f = self.standard_wall_len()?;
1285 let g = self.ut_local_len()?;
1286 a.checked_add(b)
1287 .and_then(|z| z.checked_add(c))
1288 .and_then(|z| z.checked_add(d))
1289 .and_then(|z| z.checked_add(e))
1290 .and_then(|z| z.checked_add(f))
1291 .and_then(|z| z.checked_add(g))
1292 .ok_or_else(|| {
1293 err!(
1294 "length of data block in V{} tzfile is too big",
1295 self.version
1296 )
1297 })
1298 }
1299
1300 fn transition_times_len(&self) -> Result<usize, Error> {
1301 self.tzh_timecnt.checked_mul(self.time_size).ok_or_else(|| {
1302 err!("tzh_timecnt value {} is too big", self.tzh_timecnt)
1303 })
1304 }
1305
1306 fn transition_types_len(&self) -> Result<usize, Error> {
1307 Ok(self.tzh_timecnt)
1308 }
1309
1310 fn local_time_types_len(&self) -> Result<usize, Error> {
1311 self.tzh_typecnt.checked_mul(6).ok_or_else(|| {
1312 err!("tzh_typecnt value {} is too big", self.tzh_typecnt)
1313 })
1314 }
1315
1316 fn time_zone_designations_len(&self) -> Result<usize, Error> {
1317 Ok(self.tzh_charcnt)
1318 }
1319
1320 fn leap_second_len(&self) -> Result<usize, Error> {
1321 let record_len = self
1322 .time_size
1323 .checked_add(4)
1324 .expect("4-or-8 plus 4 always fits in usize");
1325 self.tzh_leapcnt.checked_mul(record_len).ok_or_else(|| {
1326 err!("tzh_leapcnt value {} is too big", self.tzh_leapcnt)
1327 })
1328 }
1329
1330 fn standard_wall_len(&self) -> Result<usize, Error> {
1331 Ok(self.tzh_ttisstdcnt)
1332 }
1333
1334 fn ut_local_len(&self) -> Result<usize, Error> {
1335 Ok(self.tzh_ttisutcnt)
1336 }
1337}
1338
1339/// Does a quick check that returns true if the data might be in TZif format.
1340///
1341/// It is possible that this returns true even if the given data is not in TZif
1342/// format. However, it is impossible for this to return false when the given
1343/// data is TZif. That is, a false positive is allowed but a false negative is
1344/// not.
1345#[cfg(feature = "tzdb-zoneinfo")]
pub(crate) fn is_possibly_tzif(data: &[u8]) -> bool {
    // Only the four leading magic bytes are inspected, which is why data
    // that is not actually TZif can still be accepted here.
    matches!(data, [b'T', b'Z', b'i', b'f', ..])
}
1349
1350/// Interprets the given slice as an unsigned 32-bit big endian integer,
1351/// attempts to convert it to a `usize` and returns it.
1352///
1353/// # Panics
1354///
1355/// When `bytes.len() != 4`.
1356///
1357/// # Errors
1358///
1359/// This errors if the `u32` parsed from the given bytes cannot fit in a
1360/// `usize`.
1361fn from_be_bytes_u32_to_usize(bytes: &[u8]) -> Result<usize, Error> {
1362 let n = from_be_bytes_u32(bytes);
1363 usize::try_from(n).map_err(|_| {
1364 err!(
1365 "failed to parse integer {n} (too big, max allowed is {}",
1366 usize::MAX
1367 )
1368 })
1369}
1370
/// Decodes exactly four bytes as a big endian `u32`.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_u32(bytes: &[u8]) -> u32 {
    let array = <[u8; 4]>::try_from(bytes).unwrap();
    u32::from_be_bytes(array)
}
1380
/// Decodes exactly four bytes as a big endian `i32`.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_i32(bytes: &[u8]) -> i32 {
    let array = <[u8; 4]>::try_from(bytes).unwrap();
    i32::from_be_bytes(array)
}
1390
/// Decodes exactly eight bytes as a big endian `i64`.
///
/// # Panics
///
/// When `bytes.len() != 8`.
fn from_be_bytes_i64(bytes: &[u8]) -> i64 {
    let array = <[u8; 8]>::try_from(bytes).unwrap();
    i64::from_be_bytes(array)
}
1400
1401/// Splits the given slice of bytes at the index given.
1402///
1403/// If the index is out of range (greater than `bytes.len()`) then an error is
1404/// returned. The error message will include the `what` string given, which is
1405/// meant to describe the thing being split.
1406fn try_split_at<'b>(
1407 what: &'static str,
1408 bytes: &'b [u8],
1409 at: usize,
1410) -> Result<(&'b [u8], &'b [u8]), Error> {
1411 if at > bytes.len() {
1412 Err(err!(
1413 "expected at least {at} bytes for {what}, \
1414 but found only {} bytes",
1415 bytes.len(),
1416 ))
1417 } else {
1418 Ok(bytes.split_at(at))
1419 }
1420}
1421
#[cfg(test)]
mod tests {
    use alloc::string::ToString;

    use crate::tz::testdata::TZIF_TEST_FILES;

    use super::*;

    /// This converts TZif data into a human readable format.
    ///
    /// This is useful for debugging (via `./scripts/jiff-debug tzif`), but we
    /// also use it for snapshot testing to make reading the test output at
    /// least *somewhat* comprehensible for humans. Otherwise, one needs to
    /// read and understand Unix timestamps. That ain't going to fly.
    ///
    /// For this to work, we make sure everything in a `Tzif` value is
    /// represented in some way in this output.
    ///
    /// The output is written through a `TabWriter`, so the `\t` characters
    /// in the format strings below act as column separators.
    fn tzif_to_human_readable(tzif: &Tzif) -> String {
        use std::io::Write;

        let mut out = tabwriter::TabWriter::new(vec![])
            .alignment(tabwriter::Alignment::Left);

        writeln!(out, "TIME ZONE NAME").unwrap();
        writeln!(out, " {}", tzif.name().unwrap_or("UNNAMED")).unwrap();

        // The version is stored as a raw byte, so render it as a character.
        writeln!(out, "TIME ZONE VERSION").unwrap();
        writeln!(out, " {}", char::try_from(tzif.version).unwrap()).unwrap();

        writeln!(out, "LOCAL TIME TYPES").unwrap();
        for (i, typ) in tzif.types.iter().enumerate() {
            writeln!(
                out,
                " {i:03}:\toffset={off}\t\
                 designation={desig}\t{dst}\tindicator={ind}",
                off = typ.offset,
                desig = tzif.designation(&typ),
                dst = if typ.is_dst.is_dst() { "dst" } else { "" },
                ind = typ.indicator,
            )
            .unwrap();
        }
        // The remaining sections are only emitted when they have content.
        if !tzif.transitions.is_empty() {
            writeln!(out, "TRANSITIONS").unwrap();
            for (i, t) in tzif.transitions.iter().enumerate() {
                // Show the transition instant as a UTC datetime.
                let dt = Offset::UTC.to_datetime(t.timestamp);
                let typ = &tzif.types[usize::from(t.type_index)];
                let wall = alloc::format!("{:?}", t.wall.start());
                // The leading space on the gap label keeps it the same
                // width as the fold label.
                let ambiguous = match t.wall {
                    TransitionWall::Unambiguous { .. } => {
                        "unambiguous".to_string()
                    }
                    TransitionWall::Gap { end, .. } => {
                        alloc::format!(" gap-until({end:?})")
                    }
                    TransitionWall::Fold { end, .. } => {
                        alloc::format!("fold-until({end:?})")
                    }
                };

                writeln!(
                    out,
                    " {i:04}:\t{dt:?}Z\tunix={ts}\twall={wall}\t\
                     {ambiguous}\t\
                     type={type_index}\t{off}\t\
                     {desig}\t{dst}",
                    ts = t.timestamp.as_second(),
                    type_index = t.type_index,
                    off = typ.offset,
                    desig = tzif.designation(typ),
                    dst = if typ.is_dst.is_dst() { "dst" } else { "" },
                )
                .unwrap();
            }
        }
        if !tzif.leap_seconds.is_empty() {
            writeln!(out, "LEAP SECONDS").unwrap();
            for ls in tzif.leap_seconds.iter() {
                let dt = Offset::UTC.to_datetime(ls.occurrence);
                let c = ls.correction;
                writeln!(out, " {dt:?}\tcorrection={c}").unwrap();
            }
        }
        if let Some(ref posix_tz) = tzif.posix_tz {
            writeln!(out, "POSIX TIME ZONE STRING").unwrap();
            writeln!(out, " {}", posix_tz).unwrap();
        }
        String::from_utf8(out.into_inner().unwrap()).unwrap()
    }

    /// DEBUG COMMAND
    ///
    /// Takes environment variable `JIFF_DEBUG_TZIF_PATH` as input, and treats
    /// the value as a TZif file path. This test will open the file, parse it
    /// as a TZif and then dump debug data about the file in a human readable
    /// plain text format.
    ///
    /// When the environment variable is not set, this test does nothing and
    /// passes.
    #[cfg(feature = "std")]
    #[test]
    fn debug_tzif() -> anyhow::Result<()> {
        use anyhow::Context;

        let _ = crate::logging::Logger::init();

        const ENV: &str = "JIFF_DEBUG_TZIF_PATH";
        let Some(val) = std::env::var_os(ENV) else { return Ok(()) };
        let Ok(val) = val.into_string() else {
            anyhow::bail!("{ENV} has invalid UTF-8")
        };
        let bytes =
            std::fs::read(&val).with_context(|| alloc::format!("{val:?}"))?;
        let tzif = Tzif::parse(Some(val.to_string()), &bytes)?;
        std::eprint!("{}", tzif_to_human_readable(&tzif));
        Ok(())
    }

    /// Snapshot tests the human readable rendering of every bundled TZif
    /// test file, as produced by each file's `parse()`.
    #[test]
    fn tzif_parse_v2plus() {
        for tzif_test in TZIF_TEST_FILES {
            insta::assert_snapshot!(
                alloc::format!("{}_v2+", tzif_test.name),
                tzif_to_human_readable(&tzif_test.parse())
            );
        }
    }

    /// Snapshot tests the human readable rendering of every bundled TZif
    /// test file, as produced by each file's `parse_v1()`.
    #[test]
    fn tzif_parse_v1() {
        for tzif_test in TZIF_TEST_FILES {
            insta::assert_snapshot!(
                alloc::format!("{}_v1", tzif_test.name),
                tzif_to_human_readable(&tzif_test.parse_v1())
            );
        }
    }

    /// This test walks the /usr/share/zoneinfo directory (if it exists) and
    /// tries to parse every TZif formatted file it can find. We don't really
    /// do much with it other than to ensure we don't panic or return an error.
    /// That is, we check that we can parse each file, but not that we do so
    /// correctly.
    #[cfg(feature = "tzdb-zoneinfo")]
    #[cfg(target_os = "linux")]
    #[test]
    fn zoneinfo() {
        const TZDIR: &str = "/usr/share/zoneinfo";

        for result in walkdir::WalkDir::new(TZDIR) {
            // Just skip if we got an error traversing the directory tree.
            // These aren't related to our parsing, so it's some other problem
            // (like the directory not existing).
            let Ok(dent) = result else { continue };
            // This test can take some time in debug mode, so skip parsing
            // some of the less frequently used TZif files.
            let Some(name) = dent.path().to_str() else { continue };
            if name.contains("right/") || name.contains("posix/") {
                continue;
            }
            // Again, skip if we can't read. Not my monkeys, not my circus.
            let Ok(bytes) = std::fs::read(dent.path()) else { continue };
            if !is_possibly_tzif(&bytes) {
                continue;
            }
            // The time zone name is the path relative to TZDIR.
            let tzname = dent
                .path()
                .strip_prefix(TZDIR)
                .unwrap_or_else(|_| {
                    panic!("all paths in TZDIR have {TZDIR:?} prefix")
                })
                .to_str()
                .expect("all paths to be valid UTF-8")
                .to_string();
            // OK at this point, we're pretty sure `bytes` should be a TZif
            // binary file. So try to parse it and fail the test if it fails.
            if let Err(err) = Tzif::parse(Some(tzname), &bytes) {
                panic!("failed to parse TZif file {:?}: {err}", dent.path());
            }
        }
    }
}