apache_avro/rabin.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Implementation of the Rabin fingerprint algorithm
19use digest::{
20 FixedOutput, FixedOutputReset, HashMarker, Output, OutputSizeUser, Reset, Update, consts::U8,
21};
22use std::sync::OnceLock;
23
24const EMPTY: i64 = -4513414715797952619;
25
26fn fp_table() -> &'static [i64; 256] {
27 static FPTABLE_ONCE: OnceLock<[i64; 256]> = OnceLock::new();
28 FPTABLE_ONCE.get_or_init(|| {
29 let mut fp_table: [i64; 256] = [0; 256];
30 for i in 0..256 {
31 let mut fp = i;
32 for _ in 0..8 {
33 fp = (fp as u64 >> 1) as i64 ^ (EMPTY & -(fp & 1));
34 }
35 fp_table[i as usize] = fp;
36 }
37 fp_table
38 })
39}
40
41/// Implementation of the Rabin fingerprint algorithm using the [`Digest`](digest::Digest) trait as described in [schema fingerprints].
42///
43/// The digest is returned as the 8-byte little-endian encoding of the Rabin hash.
44/// This is what is used for Avro [single object encoding]
45///
46/// ```
47/// use apache_avro::rabin::Rabin;
48/// use digest::Digest;
49/// use hex_literal::hex;
50///
51/// // create the Rabin hasher
52/// let mut hasher = Rabin::new();
53///
54/// // add the data
55/// hasher.update(b"hello world");
56///
57/// // read hash digest and consume hasher
58/// let result = hasher.finalize();
59///
60/// assert_eq!(result[..], hex!("60335ba6d0415528"));
61/// ```
62///
63/// To convert the digest to the commonly used 64-bit integer value, you can use the [`i64::from_le_bytes()`] function
64///
65/// ```
66/// # use apache_avro::rabin::Rabin;
67/// # use digest::Digest;
68/// # use hex_literal::hex;
69///
70/// # let mut hasher = Rabin::new();
71///
72/// # hasher.update(b"hello world");
73///
74/// # let result = hasher.finalize();
75///
76/// # assert_eq!(result[..], hex!("60335ba6d0415528"));
77///
78/// let i = i64::from_le_bytes(result.try_into().unwrap());
79///
80/// assert_eq!(i, 2906301498937520992)
81/// ```
82/// [single object encoding](https://avro.apache.org/docs/++version++/specification/#single-object-encoding)
83/// [schema fingerprints](https://avro.apache.org/docs/++version++/specification/#schema-fingerprints)
84#[derive(Clone)]
85pub struct Rabin {
86 result: i64,
87}
88
89impl Default for Rabin {
90 fn default() -> Self {
91 Rabin { result: EMPTY }
92 }
93}
94
95impl Update for Rabin {
96 fn update(&mut self, data: &[u8]) {
97 for b in data {
98 self.result = (self.result as u64 >> 8) as i64
99 ^ fp_table()[((self.result ^ *b as i64) & 0xff) as usize];
100 }
101 }
102}
103
104impl FixedOutput for Rabin {
105 fn finalize_into(self, out: &mut Output<Self>) {
106 out.copy_from_slice(&self.result.to_le_bytes())
107 }
108}
109
110impl Reset for Rabin {
111 fn reset(&mut self) {
112 self.result = EMPTY;
113 }
114}
115
116impl OutputSizeUser for Rabin {
117 // 8-byte little-endian form of the i64
118 // See: https://avro.apache.org/docs/++version++/specification/#single-object-encoding
119 type OutputSize = U8;
120}
121
122impl HashMarker for Rabin {}
123
124impl FixedOutputReset for Rabin {
125 fn finalize_into_reset(&mut self, out: &mut Output<Self>) {
126 out.copy_from_slice(&self.result.to_le_bytes());
127 self.reset();
128 }
129}
130
131#[cfg(test)]
132mod tests {
133 use super::Rabin;
134 use apache_avro_test_helper::TestResult;
135 use digest::Digest;
136 use pretty_assertions::assert_eq;
137
138 // See: https://github.com/apache/avro/blob/main/share/test/data/schema-tests.txt
139 #[test]
140 fn test1() -> TestResult {
141 let data: &[(&str, i64)] = &[
142 (r#""null""#, 7195948357588979594),
143 (r#""boolean""#, -6970731678124411036),
144 (
145 r#"{"name":"foo","type":"fixed","size":15}"#,
146 1756455273707447556,
147 ),
148 (
149 r#"{"name":"PigValue","type":"record","fields":[{"name":"value","type":["null","int","long","PigValue"]}]}"#,
150 -1759257747318642341,
151 ),
152 ];
153
154 let mut hasher = Rabin::new();
155
156 for (s, fp) in data {
157 hasher.update(s.as_bytes());
158 let res: &[u8] = &hasher.finalize_reset();
159 let result = i64::from_le_bytes(res.try_into()?);
160 assert_eq!(*fp, result);
161 }
162
163 Ok(())
164 }
165}