apache_avro/
rabin.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Implementation of the Rabin fingerprint algorithm
19use digest::{
20    FixedOutput, FixedOutputReset, HashMarker, Output, OutputSizeUser, Reset, Update, consts::U8,
21};
22use std::sync::OnceLock;
23
24const EMPTY: i64 = -4513414715797952619;
25
26fn fp_table() -> &'static [i64; 256] {
27    static FPTABLE_ONCE: OnceLock<[i64; 256]> = OnceLock::new();
28    FPTABLE_ONCE.get_or_init(|| {
29        let mut fp_table: [i64; 256] = [0; 256];
30        for i in 0..256 {
31            let mut fp = i;
32            for _ in 0..8 {
33                fp = (fp as u64 >> 1) as i64 ^ (EMPTY & -(fp & 1));
34            }
35            fp_table[i as usize] = fp;
36        }
37        fp_table
38    })
39}
40
41/// Implementation of the Rabin fingerprint algorithm using the [`Digest`](digest::Digest) trait as described in [schema fingerprints].
42///
43/// The digest is returned as the 8-byte little-endian encoding of the Rabin hash.
44/// This is what is used for Avro [single object encoding]
45///
46/// ```
47/// use apache_avro::rabin::Rabin;
48/// use digest::Digest;
49/// use hex_literal::hex;
50///
51/// // create the Rabin hasher
52/// let mut hasher = Rabin::new();
53///
54/// // add the data
55/// hasher.update(b"hello world");
56///
57/// // read hash digest and consume hasher
58/// let result = hasher.finalize();
59///
60/// assert_eq!(result[..], hex!("60335ba6d0415528"));
61/// ```
62///
63/// To convert the digest to the commonly used 64-bit integer value, you can use the [`i64::from_le_bytes()`] function
64///
65/// ```
66/// # use apache_avro::rabin::Rabin;
67/// # use digest::Digest;
68/// # use hex_literal::hex;
69///
70/// # let mut hasher = Rabin::new();
71///
72/// # hasher.update(b"hello world");
73///
74/// # let result = hasher.finalize();
75///
76/// # assert_eq!(result[..], hex!("60335ba6d0415528"));
77///
78/// let i = i64::from_le_bytes(result.try_into().unwrap());
79///
80/// assert_eq!(i, 2906301498937520992)
81/// ```
82/// [single object encoding](https://avro.apache.org/docs/++version++/specification/#single-object-encoding)
83/// [schema fingerprints](https://avro.apache.org/docs/++version++/specification/#schema-fingerprints)
84#[derive(Clone)]
85pub struct Rabin {
86    result: i64,
87}
88
89impl Default for Rabin {
90    fn default() -> Self {
91        Rabin { result: EMPTY }
92    }
93}
94
95impl Update for Rabin {
96    fn update(&mut self, data: &[u8]) {
97        for b in data {
98            self.result = (self.result as u64 >> 8) as i64
99                ^ fp_table()[((self.result ^ *b as i64) & 0xff) as usize];
100        }
101    }
102}
103
104impl FixedOutput for Rabin {
105    fn finalize_into(self, out: &mut Output<Self>) {
106        out.copy_from_slice(&self.result.to_le_bytes())
107    }
108}
109
110impl Reset for Rabin {
111    fn reset(&mut self) {
112        self.result = EMPTY;
113    }
114}
115
116impl OutputSizeUser for Rabin {
117    // 8-byte little-endian form of the i64
118    // See: https://avro.apache.org/docs/++version++/specification/#single-object-encoding
119    type OutputSize = U8;
120}
121
122impl HashMarker for Rabin {}
123
124impl FixedOutputReset for Rabin {
125    fn finalize_into_reset(&mut self, out: &mut Output<Self>) {
126        out.copy_from_slice(&self.result.to_le_bytes());
127        self.reset();
128    }
129}
130
131#[cfg(test)]
132mod tests {
133    use super::Rabin;
134    use apache_avro_test_helper::TestResult;
135    use digest::Digest;
136    use pretty_assertions::assert_eq;
137
138    // See: https://github.com/apache/avro/blob/main/share/test/data/schema-tests.txt
139    #[test]
140    fn test1() -> TestResult {
141        let data: &[(&str, i64)] = &[
142            (r#""null""#, 7195948357588979594),
143            (r#""boolean""#, -6970731678124411036),
144            (
145                r#"{"name":"foo","type":"fixed","size":15}"#,
146                1756455273707447556,
147            ),
148            (
149                r#"{"name":"PigValue","type":"record","fields":[{"name":"value","type":["null","int","long","PigValue"]}]}"#,
150                -1759257747318642341,
151            ),
152        ];
153
154        let mut hasher = Rabin::new();
155
156        for (s, fp) in data {
157            hasher.update(s.as_bytes());
158            let res: &[u8] = &hasher.finalize_reset();
159            let result = i64::from_le_bytes(res.try_into()?);
160            assert_eq!(*fp, result);
161        }
162
163        Ok(())
164    }
165}