apache_avro/rabin.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Implementation of the Rabin fingerprint algorithm
19use digest::{
20 FixedOutput, FixedOutputReset, HashMarker, Output, Reset, Update, consts::U8,
21 core_api::OutputSizeUser, generic_array::GenericArray,
22};
23use std::sync::OnceLock;
24
25const EMPTY: i64 = -4513414715797952619;
26
27fn fp_table() -> &'static [i64; 256] {
28 static FPTABLE_ONCE: OnceLock<[i64; 256]> = OnceLock::new();
29 FPTABLE_ONCE.get_or_init(|| {
30 let mut fp_table: [i64; 256] = [0; 256];
31 for i in 0..256 {
32 let mut fp = i;
33 for _ in 0..8 {
34 fp = (fp as u64 >> 1) as i64 ^ (EMPTY & -(fp & 1));
35 }
36 fp_table[i as usize] = fp;
37 }
38 fp_table
39 })
40}
41
42/// Implementation of the Rabin fingerprint algorithm using the [`Digest`](digest::Digest) trait as described in [schema fingerprints].
43///
44/// The digest is returned as the 8-byte little-endian encoding of the Rabin hash.
45/// This is what is used for Avro [single object encoding]
46///
47/// ```
48/// use apache_avro::rabin::Rabin;
49/// use digest::Digest;
50/// use hex_literal::hex;
51///
52/// // create the Rabin hasher
53/// let mut hasher = Rabin::new();
54///
55/// // add the data
56/// hasher.update(b"hello world");
57///
58/// // read hash digest and consume hasher
59/// let result = hasher.finalize();
60///
61/// assert_eq!(result[..], hex!("60335ba6d0415528"));
62/// ```
63///
64/// To convert the digest to the commonly used 64-bit integer value, you can use the [`i64::from_le_bytes()`] function
65///
66/// ```
67/// # use apache_avro::rabin::Rabin;
68/// # use digest::Digest;
69/// # use hex_literal::hex;
70///
71/// # let mut hasher = Rabin::new();
72///
73/// # hasher.update(b"hello world");
74///
75/// # let result = hasher.finalize();
76///
77/// # assert_eq!(result[..], hex!("60335ba6d0415528"));
78///
79/// let i = i64::from_le_bytes(result.try_into().unwrap());
80///
81/// assert_eq!(i, 2906301498937520992)
82/// ```
83/// [single object encoding](https://avro.apache.org/docs/++version++/specification/#single-object-encoding)
84/// [schema fingerprints](https://avro.apache.org/docs/++version++/specification/#schema-fingerprints)
85#[derive(Clone)]
86pub struct Rabin {
87 result: i64,
88}
89
90impl Default for Rabin {
91 fn default() -> Self {
92 Rabin { result: EMPTY }
93 }
94}
95
96impl Update for Rabin {
97 fn update(&mut self, data: &[u8]) {
98 for b in data {
99 self.result = (self.result as u64 >> 8) as i64
100 ^ fp_table()[((self.result ^ *b as i64) & 0xff) as usize];
101 }
102 }
103}
104
105impl FixedOutput for Rabin {
106 fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) {
107 out.copy_from_slice(&self.result.to_le_bytes());
108 }
109}
110
111impl Reset for Rabin {
112 fn reset(&mut self) {
113 self.result = EMPTY;
114 }
115}
116
117impl OutputSizeUser for Rabin {
118 // 8-byte little-endian form of the i64
119 // See: https://avro.apache.org/docs/++version++/specification/#single-object-encoding
120 type OutputSize = U8;
121}
122
123impl HashMarker for Rabin {}
124
125impl FixedOutputReset for Rabin {
126 fn finalize_into_reset(&mut self, out: &mut Output<Self>) {
127 out.copy_from_slice(&self.result.to_le_bytes());
128 self.reset();
129 }
130}
131
132#[cfg(test)]
133mod tests {
134 use super::Rabin;
135 use apache_avro_test_helper::TestResult;
136 use digest::Digest;
137 use pretty_assertions::assert_eq;
138
139 // See: https://github.com/apache/avro/blob/main/share/test/data/schema-tests.txt
140 #[test]
141 fn test1() -> TestResult {
142 let data: &[(&str, i64)] = &[
143 (r#""null""#, 7195948357588979594),
144 (r#""boolean""#, -6970731678124411036),
145 (
146 r#"{"name":"foo","type":"fixed","size":15}"#,
147 1756455273707447556,
148 ),
149 (
150 r#"{"name":"PigValue","type":"record","fields":[{"name":"value","type":["null","int","long","PigValue"]}]}"#,
151 -1759257747318642341,
152 ),
153 ];
154
155 let mut hasher = Rabin::new();
156
157 for (s, fp) in data {
158 hasher.update(s.as_bytes());
159 let res: &[u8] = &hasher.finalize_reset();
160 let result = i64::from_le_bytes(res.try_into()?);
161 assert_eq!(*fp, result);
162 }
163
164 Ok(())
165 }
166}