apache_avro/
util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Utility functions, like configuring various global settings.
19
20use crate::{AvroResult, error::Details, schema::Documentation};
21use serde_json::{Map, Value};
22use std::{
23    io::{Read, Write},
24    sync::OnceLock,
25};
26
/// Maximum number of bytes that can be allocated when decoding Avro-encoded values (512 MiB).
///
/// This is a protection against ill-formed data, whose length field might be interpreted as enormous.
///
/// See [`max_allocation_bytes`] to change this limit.
pub const DEFAULT_MAX_ALLOCATION_BYTES: usize = 512 * 1024 * 1024;
/// The allocation limit currently in effect.
///
/// It is initialised at most once, by the first call to [`max_allocation_bytes`];
/// later calls only read the stored value.
static MAX_ALLOCATION_BYTES: OnceLock<usize> = OnceLock::new();
34
/// Default for whether serialization & deserialization traits are set as `human_readable`.
///
/// See [`set_serde_human_readable`] to change this value.
pub const DEFAULT_SERDE_HUMAN_READABLE: bool = false;
/// Whether the serializer and deserializer should indicate to types that the format is human-readable.
///
/// It is initialised at most once: either by [`set_serde_human_readable`], or with
/// [`DEFAULT_SERDE_HUMAN_READABLE`] the first time `is_human_readable` reads it.
// crate-visible for testing
pub(crate) static SERDE_HUMAN_READABLE: OnceLock<bool> = OnceLock::new();
42
43pub(crate) trait MapHelper {
44    fn string(&self, key: &str) -> Option<&str>;
45
46    fn name(&self) -> Option<&str> {
47        self.string("name")
48    }
49
50    fn doc(&self) -> Documentation {
51        self.string("doc").map(Into::into)
52    }
53
54    fn aliases(&self) -> Option<Vec<String>>;
55}
56
57impl MapHelper for Map<String, Value> {
58    fn string(&self, key: &str) -> Option<&str> {
59        self.get(key).and_then(|v| v.as_str())
60    }
61
62    fn aliases(&self) -> Option<Vec<String>> {
63        // FIXME no warning when aliases aren't a json array of json strings
64        self.get("aliases")
65            .and_then(|aliases| aliases.as_array())
66            .and_then(|aliases| {
67                aliases
68                    .iter()
69                    .map(|alias| alias.as_str())
70                    .map(|alias| alias.map(|a| a.to_string()))
71                    .collect::<Option<_>>()
72            })
73    }
74}
75
/// Decode a zigzagged varint from the reader as an `i64`.
///
/// This simply forwards to [`zag_i64`], including its errors.
pub(crate) fn read_long<R: Read>(reader: &mut R) -> AvroResult<i64> {
    zag_i64(reader)
}
79
80/// Write the number as a zigzagged varint to the writer.
81pub(crate) fn zig_i32<W: Write>(n: i32, buffer: W) -> AvroResult<usize> {
82    zig_i64(n as i64, buffer)
83}
84
85/// Write the number as a zigzagged varint to the writer.
86pub(crate) fn zig_i64<W: Write>(n: i64, writer: W) -> AvroResult<usize> {
87    let zigzagged = ((n << 1) ^ (n >> 63)) as u64;
88    encode_variable(zigzagged, writer)
89}
90
91/// Decode a zigzagged varint from the reader.
92pub(crate) fn zag_i32<R: Read>(reader: &mut R) -> AvroResult<i32> {
93    let i = zag_i64(reader)?;
94    i32::try_from(i).map_err(|e| Details::ZagI32(e, i).into())
95}
96
97/// Decode a zigzagged varint from the reader.
98pub(crate) fn zag_i64<R: Read>(reader: &mut R) -> AvroResult<i64> {
99    let z = decode_variable(reader)?;
100    Ok(if z & 0x1 == 0 {
101        (z >> 1) as i64
102    } else {
103        !(z >> 1) as i64
104    })
105}
106
107/// Write the number as a varint to the writer.
108///
109/// Note: this function does not do zigzag encoding, for that see [`zig_i32`] and [`zig_i64`].
110fn encode_variable<W: Write>(mut zigzagged: u64, mut writer: W) -> AvroResult<usize> {
111    // Ensure the number is little endian for the varint encoding (no-op on LE systems)
112    zigzagged = zigzagged.to_le();
113    // Encode the number as a varint
114    let mut buffer = [0u8; 10];
115    let mut i: usize = 0;
116    loop {
117        if zigzagged <= 0x7F {
118            buffer[i] = (zigzagged & 0x7F) as u8;
119            i += 1;
120            break;
121        } else {
122            buffer[i] = (0x80 | (zigzagged & 0x7F)) as u8;
123            i += 1;
124            zigzagged >>= 7;
125        }
126    }
127    writer
128        .write_all(&buffer[..i])
129        .map_err(Details::WriteBytes)?;
130    Ok(i)
131}
132
133/// Read a varint from the reader.
134///
135/// Note: this function does not do zigzag decoding, for that see [`zag_i32`] and [`zag_i64`].
136fn decode_variable<R: Read>(reader: &mut R) -> AvroResult<u64> {
137    let mut i = 0u64;
138    let mut buf = [0u8; 1];
139
140    let mut j = 0;
141    loop {
142        if j > 9 {
143            // if j * 7 > 64
144            return Err(Details::IntegerOverflow.into());
145        }
146        reader
147            .read_exact(&mut buf[..])
148            .map_err(Details::ReadVariableIntegerBytes)?;
149        i |= (u64::from(buf[0] & 0x7F)) << (j * 7);
150        if (buf[0] >> 7) == 0 {
151            break;
152        } else {
153            j += 1;
154        }
155    }
156
157    Ok(u64::from_le(i))
158}
159
160/// Set the maximum number of bytes that can be allocated when decoding data.
161///
162/// This function only changes the setting once. On subsequent calls the value will stay the same
163/// as the first time it is called. It is automatically called on first allocation and defaults to
164/// [`DEFAULT_MAX_ALLOCATION_BYTES`].
165///
166/// # Returns
167/// The configured maximum, which might be different from what the function was called with if the
168/// value was already set before.
169pub fn max_allocation_bytes(num_bytes: usize) -> usize {
170    *MAX_ALLOCATION_BYTES.get_or_init(|| num_bytes)
171}
172
173pub(crate) fn safe_len(len: usize) -> AvroResult<usize> {
174    let max_bytes = max_allocation_bytes(DEFAULT_MAX_ALLOCATION_BYTES);
175
176    if len <= max_bytes {
177        Ok(len)
178    } else {
179        Err(Details::MemoryAllocation {
180            desired: len,
181            maximum: max_bytes,
182        }
183        .into())
184    }
185}
186
187/// Set whether the serializer and deserializer should indicate to types that the format is human-readable.
188///
189/// This function only changes the setting once. On subsequent calls the value will stay the same
190/// as the first time it is called. It is automatically called on first allocation and defaults to
191/// [`DEFAULT_SERDE_HUMAN_READABLE`].
192///
193/// *NOTE*: Changing this setting can change the output of [`from_value`](crate::from_value) and the
194/// accepted input of [`to_value`](crate::to_value).
195///
196/// # Returns
197/// The configured human-readable value, which might be different from what the function was called
198/// with if the value was already set before.
199pub fn set_serde_human_readable(human_readable: bool) -> bool {
200    *SERDE_HUMAN_READABLE.get_or_init(|| human_readable)
201}
202
203pub(crate) fn is_human_readable() -> bool {
204    *SERDE_HUMAN_READABLE.get_or_init(|| DEFAULT_SERDE_HUMAN_READABLE)
205}
206
#[cfg(test)]
mod tests {
    use super::*;
    use apache_avro_test_helper::TestResult;
    use pretty_assertions::assert_eq;

    /// The 32-bit and 64-bit encoders must agree on the bytes for the same number.
    #[test]
    fn test_zigzag() {
        let mut a = Vec::new();
        let mut b = Vec::new();
        zig_i32(42i32, &mut a).unwrap();
        zig_i64(42i64, &mut b).unwrap();
        assert_eq!(a, b);
    }

    /// Pins the exact wire bytes for boundary values of the 64-bit encoder.
    #[test]
    fn test_zig_i64() {
        let mut s = Vec::new();

        zig_i64(0, &mut s).unwrap();
        assert_eq!(s, [0]);

        s.clear();
        zig_i64(-1, &mut s).unwrap();
        assert_eq!(s, [1]);

        s.clear();
        zig_i64(1, &mut s).unwrap();
        assert_eq!(s, [2]);

        s.clear();
        zig_i64(-64, &mut s).unwrap();
        assert_eq!(s, [127]);

        s.clear();
        zig_i64(64, &mut s).unwrap();
        assert_eq!(s, [128, 1]);

        s.clear();
        zig_i64(i32::MAX as i64, &mut s).unwrap();
        assert_eq!(s, [254, 255, 255, 255, 15]);

        s.clear();
        zig_i64(i32::MAX as i64 + 1, &mut s).unwrap();
        assert_eq!(s, [128, 128, 128, 128, 16]);

        s.clear();
        zig_i64(i32::MIN as i64, &mut s).unwrap();
        assert_eq!(s, [255, 255, 255, 255, 15]);

        s.clear();
        zig_i64(i32::MIN as i64 - 1, &mut s).unwrap();
        assert_eq!(s, [129, 128, 128, 128, 16]);

        s.clear();
        zig_i64(i64::MAX, &mut s).unwrap();
        assert_eq!(s, [254, 255, 255, 255, 255, 255, 255, 255, 255, 1]);

        s.clear();
        zig_i64(i64::MIN, &mut s).unwrap();
        assert_eq!(s, [255, 255, 255, 255, 255, 255, 255, 255, 255, 1]);
    }

    /// Pins the exact wire bytes for boundary values of the 32-bit encoder.
    #[test]
    fn test_zig_i32() {
        let mut s = Vec::new();
        zig_i32(i32::MAX / 2, &mut s).unwrap();
        assert_eq!(s, [254, 255, 255, 255, 7]);

        s.clear();
        zig_i32(i32::MIN / 2, &mut s).unwrap();
        assert_eq!(s, [255, 255, 255, 255, 7]);

        s.clear();
        zig_i32(-(i32::MIN / 2), &mut s).unwrap();
        assert_eq!(s, [128, 128, 128, 128, 8]);

        s.clear();
        zig_i32(i32::MIN / 2 - 1, &mut s).unwrap();
        assert_eq!(s, [129, 128, 128, 128, 8]);

        s.clear();
        zig_i32(i32::MAX, &mut s).unwrap();
        assert_eq!(s, [254, 255, 255, 255, 15]);

        s.clear();
        zig_i32(i32::MIN, &mut s).unwrap();
        assert_eq!(s, [255, 255, 255, 255, 15]);
    }

    /// Ten bytes that all carry the continuation bit must be rejected.
    #[test]
    fn test_overflow() {
        let causes_left_shift_overflow: &[u8] = &[0xe1; 10];
        assert!(matches!(
            decode_variable(&mut &*causes_left_shift_overflow)
                .unwrap_err()
                .details(),
            Details::IntegerOverflow
        ));
    }

    #[test]
    fn test_safe_len() -> TestResult {
        assert_eq!(42usize, safe_len(42usize)?);
        assert!(safe_len(1024 * 1024 * 1024).is_err());

        Ok(())
    }

    /// Whatever the 64-bit encoder writes, the 64-bit decoder must read back unchanged.
    #[test]
    fn test_zig_zag_i64_roundtrip() -> TestResult {
        for n in [
            0i64,
            -1,
            1,
            -64,
            64,
            i32::MAX as i64,
            i32::MIN as i64,
            i64::MAX,
            i64::MIN,
        ] {
            let mut encoded = Vec::new();
            let written = zig_i64(n, &mut encoded)?;
            assert_eq!(written, encoded.len());
            assert_eq!(n, zag_i64(&mut &encoded[..])?);
        }

        Ok(())
    }

    /// Whatever the 32-bit encoder writes, the 32-bit decoder must read back unchanged.
    #[test]
    fn test_zig_zag_i32_roundtrip() -> TestResult {
        for n in [0i32, -1, 1, -64, 64, i32::MAX, i32::MIN] {
            let mut encoded = Vec::new();
            zig_i32(n, &mut encoded)?;
            assert_eq!(n, zag_i32(&mut &encoded[..])?);
        }

        Ok(())
    }

    /// A value outside the `i32` range must not be silently truncated by the 32-bit decoder.
    #[test]
    fn test_zag_i32_out_of_range() -> TestResult {
        let mut encoded = Vec::new();
        zig_i64(i32::MAX as i64 + 1, &mut encoded)?;
        assert!(zag_i32(&mut &encoded[..]).is_err());

        Ok(())
    }

    /// Input that ends while the continuation bit is still set must produce an error.
    #[test]
    fn test_decode_truncated_input() {
        let truncated: &[u8] = &[0x80];
        assert!(decode_variable(&mut &*truncated).is_err());
    }
}