unicode_normalization/
normalize.rs

1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
//! Functions for computing canonical and compatibility decompositions of Unicode
//! characters, and for composing pairs of characters.
12
13use std::cmp::Ordering::{Equal, Less, Greater};
14use std::ops::FnMut;
15use tables::normalization::{canonical_table, compatibility_table, composition_table};
16
// Binary-searches the table `r` for an entry whose key is `c` and returns the
// slice stored next to that key, or `None` when no entry has that key.
//
// The lookup relies on `r` being sorted by key in ascending order.
fn bsearch_table<T>(c: char, r: &'static [(char, &'static [T])]) -> Option<&'static [T]> {
    r.binary_search_by(|&(key, _)| key.cmp(&c))
        .ok()
        .map(|idx| r[idx].1)
}
30
31/// Compute canonical Unicode decomposition for character.
32/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
33/// for more information.
34pub fn decompose_canonical<F>(c: char, mut i: F) where F: FnMut(char) { d(c, &mut i, false); }
35
36/// Compute canonical or compatible Unicode decomposition for character.
37/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
38/// for more information.
39pub fn decompose_compatible<F>(c: char, mut i: F) where F: FnMut(char) { d(c, &mut i, true); }
40
41// FIXME(#19596) This is a workaround, we should use `F` instead of `&mut F`
42fn d<F>(c: char, i: &mut F, k: bool) where F: FnMut(char) {
43    // 7-bit ASCII never decomposes
44    if c <= '\x7f' { (*i)(c); return; }
45
46    // Perform decomposition for Hangul
47    if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
48        decompose_hangul(c, i);
49        return;
50    }
51
52    // First check the canonical decompositions
53    match bsearch_table(c, canonical_table) {
54        Some(canon) => {
55            for x in canon {
56                d(*x, i, k);
57            }
58            return;
59        }
60        None => ()
61    }
62
63    // Bottom out if we're not doing compat.
64    if !k { (*i)(c); return; }
65
66    // Then check the compatibility decompositions
67    match bsearch_table(c, compatibility_table) {
68        Some(compat) => {
69            for x in compat {
70                d(*x, i, k);
71            }
72            return;
73        }
74        None => ()
75    }
76
77    // Finally bottom out.
78    (*i)(c);
79}
80
81/// Compose two characters into a single character, if possible.
82/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
83/// for more information.
84pub fn compose(a: char, b: char) -> Option<char> {
85    compose_hangul(a, b).or_else(|| {
86        match bsearch_table(a, composition_table) {
87            None => None,
88            Some(candidates) => {
89                match candidates.binary_search_by(|&(val, _)| {
90                    if b == val { Equal }
91                    else if val < b { Less }
92                    else { Greater }
93                }) {
94                    Ok(idx) => {
95                        let (_, result) = candidates[idx];
96                        Some(result)
97                    }
98                    Err(_) => None
99                }
100            }
101        }
102    })
103}
104
// Hangul syllable arithmetic constants, taken from Unicode 7.0.0 Section 3.12,
// "Conjoining Jamo Behavior".

// Start of the precomposed syllable block; S_COUNT syllables follow from here.
const S_BASE: u32 = 0xAC00;

// Leading consonants (L): first code point and how many there are.
const L_BASE: u32 = 0x1100;
const L_COUNT: u32 = 19;

// Vowels (V): first code point and how many there are.
const V_BASE: u32 = 0x1161;
const V_COUNT: u32 = 21;

// Trailing consonants (T): index 0 stands for "no trailing consonant", so
// T_BASE sits one below the first real trailing consonant.
const T_BASE: u32 = 0x11A7;
const T_COUNT: u32 = 28;

// Syllables per leading consonant, and syllables in total.
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
115
116// FIXME(#19596) This is a workaround, we should use `F` instead of `&mut F`
117// Decompose a precomposed Hangul syllable
118#[allow(unsafe_code)]
119#[inline(always)]
120fn decompose_hangul<F>(s: char, f: &mut F) where F: FnMut(char) {
121    use std::mem::transmute;
122
123    let si = s as u32 - S_BASE;
124
125    let li = si / N_COUNT;
126    unsafe {
127        (*f)(transmute(L_BASE + li));
128
129        let vi = (si % N_COUNT) / T_COUNT;
130        (*f)(transmute(V_BASE + vi));
131
132        let ti = si % T_COUNT;
133        if ti > 0 {
134            (*f)(transmute(T_BASE + ti));
135        }
136    }
137}
138
139// Compose a pair of Hangul Jamo
140#[allow(unsafe_code)]
141#[inline(always)]
142fn compose_hangul(a: char, b: char) -> Option<char> {
143    use std::mem::transmute;
144
145    let l = a as u32;
146    let v = b as u32;
147    // Compose an LPart and a VPart
148    if L_BASE <= l && l < (L_BASE + L_COUNT) && V_BASE <= v && v < (V_BASE + V_COUNT) {
149        let r = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT;
150        return unsafe { Some(transmute(r)) };
151    }
152    // Compose an LVPart and a TPart
153    if S_BASE <= l && l <= (S_BASE+S_COUNT-T_COUNT) && T_BASE <= v && v < (T_BASE+T_COUNT) {
154        let r = l + (v - T_BASE);
155        return unsafe { Some(transmute(r)) };
156    }
157    None
158}