apache_avro/lib.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! [![rust continuous integration][rust continuous integration img]][rust continuous integration]
//! [![rust clippy check][rust clippy check img]][rust clippy check]
//! [![rust security audit][rust security audit img]][rust security audit]
//! [![rust continuous integration ARM64][rust continuous integration ARM64 img]][rust continuous integration ARM64]
//!
//! [rust continuous integration]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci.yml
//! [rust continuous integration ARM64]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci-ARM.yml
//! [rust clippy check]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-clippy.yml
//! [rust security audit]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-audit.yml
//!
//! [rust continuous integration img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci.yml/badge.svg
//! [rust clippy check img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-clippy.yml/badge.svg
//! [rust security audit img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-audit.yml/badge.svg
//! [rust continuous integration ARM64 img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci-ARM.yml/badge.svg
//!
//! A library for working with [Apache Avro](https://avro.apache.org/) in Rust.
//!
//! Please check our [documentation](https://docs.rs/apache-avro) for examples, tutorials and API reference.
//!
//! **[Apache Avro](https://avro.apache.org/)** is a data serialization system which provides rich
//! data structures and a compact, fast, binary data format.
//!
//! All data in Avro is schematized, as in the following example:
//!
//! ```json
//! {
//!     "type": "record",
//!     "name": "test",
//!     "fields": [
//!         {"name": "a", "type": "long", "default": 42},
//!         {"name": "b", "type": "string"}
//!     ]
//! }
//! ```
//! There are basically two ways of handling Avro data in Rust:
//!
//! * **as Avro-specialized data types** based on an Avro schema;
//! * **as generic Rust serde-compatible types** implementing/deriving `Serialize` and `Deserialize`;
//!
//! **apache-avro** provides a way to read and write both these data representations easily and
//! efficiently.
//!
//! # Installing the library
//!
//! Add to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! apache-avro = "x.y"
//! ```
//!
//! Or in case you want to leverage the **Snappy** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["snappy"]
//! ```
//!
//! Or in case you want to leverage the **Zstandard** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["zstandard"]
//! ```
//!
//! Or in case you want to leverage the **Bzip2** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["bzip"]
//! ```
//!
//! Or in case you want to leverage the **Xz** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["xz"]
//! ```
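//!
//! Cargo features are additive, so the codec features can also be combined in a single
//! dependency entry (a sketch; any subset works):
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["snappy", "zstandard", "bzip", "xz"]
//! ```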
//!
//! # Upgrading to a newer minor version
//!
//! The library is still in beta, so there might be backward-incompatible changes between minor
//! versions. If you have trouble upgrading, check the release notes.
//!
//! # Minimum supported Rust version
//!
//! 1.74.0
//!
//! # Defining a schema
//!
//! Avro data cannot exist without an Avro schema. Schemas **must** be used while writing and
//! **can** be used while reading; they carry the information regarding the type of data we are
//! handling. Avro schemas are used for both schema validation and resolution of Avro data.
//!
//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string:
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"}
//!         ]
//!     }
//! "#;
//!
//! // if the schema is not valid, this function will return an error
//! let schema = Schema::parse_str(raw_schema).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schema);
//! ```
//!
//! Additionally, a list of definitions (which may depend on each other) can be given and all of
//! them will be parsed into the corresponding schemas.
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema_1 = r#"{
//!     "name": "A",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "float"}
//!     ]
//! }"#;
//!
//! // This definition depends on the definition of A above
//! let raw_schema_2 = r#"{
//!     "name": "B",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "A"}
//!     ]
//! }"#;
//!
//! // if the schemas are not valid, this function will return an error
//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schemas);
//! ```
//!
//! *N.B.* The composition of schema definitions requires schemas with names. For this reason,
//! only schemas of type Record, Enum, and Fixed should be input into this function.
//!
//! The library also provides a programmatic interface to define schemas without encoding them in
//! JSON (for advanced use), but we highly recommend the JSON interface. Please read the API
//! reference in case you are interested.
//!
//! For more information about schemas and what kind of information you can encapsulate in them,
//! please refer to the appropriate section of the
//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration).
//!
//! # Writing data
//!
//! Once we have defined a schema, we are ready to serialize data in Avro, validating it against
//! the provided schema in the process. As mentioned before, there are two ways of handling Avro
//! data in Rust.
//!
//! **NOTE:** The library also provides a low-level interface for encoding a single datum in Avro
//! binary format without generating markers and headers (for advanced use), but we highly
//! recommend the `Writer` interface in order to stay fully Avro-compatible. Please read the API
//! reference in case you are interested.
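//!
//! For illustration, here is a minimal sketch of that low-level interface, round-tripping a
//! single value through the exported `to_avro_datum` and `from_avro_datum` functions (no
//! container-file header or markers are produced, so the schema must travel out of band):
//!
//! ```
//! use apache_avro::{from_avro_datum, to_avro_datum, types::Value, Schema};
//!
//! let schema = Schema::parse_str(r#""long""#).unwrap();
//!
//! // encode a single datum: just the value, with no header, marker or codec
//! let encoded = to_avro_datum(&schema, Value::Long(42)).unwrap();
//!
//! // decode it back, supplying the writer schema ourselves
//! let decoded = from_avro_datum(&schema, &mut &encoded[..], None).unwrap();
//! assert_eq!(decoded, Value::Long(42));
//! ```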
//!
//! ## The avro way
//!
//! Given that the schema we defined above is that of an Avro *Record*, we are going to use the
//! associated type provided by the library to specify the data we want to serialize:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::types::Record;
//! use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the Record type models our Record schema
//! let mut record = Record::new(writer.schema()).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // schema validation happens here
//! writer.append(record).unwrap();
//!
//! // this is how to get back the resulting avro binary data
//! // this performs a flush operation to make sure data has been written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, the library offers that capability via the
//! `Value` interface.
//!
//! ```
//! use apache_avro::types::Value;
//!
//! let value = Value::String("foo".to_string());
//! ```
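//!
//! A standalone `Value` built this way can also be checked against a schema before writing it;
//! a minimal sketch using `Value::validate`, which returns a plain `bool`:
//!
//! ```
//! use apache_avro::{types::Value, Schema};
//!
//! let schema = Schema::parse_str(r#""string""#).unwrap();
//! let value = Value::String("foo".to_string());
//!
//! // check the value against the schema without serializing it
//! assert!(value.validate(&schema));
//! ```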
//!
//! ## The serde way
//!
//! Given that the schema we defined above is an Avro *Record*, we can directly use a Rust struct
//! deriving `Serialize` to model our data:
//!
//! ```
//! # use apache_avro::Schema;
//! # use serde::Serialize;
//! use apache_avro::Writer;
//!
//! #[derive(Debug, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the structure models our Record schema
//! let test = Test {
//!     a: 27,
//!     b: "foo".to_owned(),
//! };
//!
//! // schema validation happens here
//! writer.append_ser(test).unwrap();
//!
//! // this is how to get back the resulting avro binary data
//! // this performs a flush operation to make sure data is written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! Most of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, any type implementing `Serialize` should work.
//!
//! ```
//! let value = "foo".to_string();
//! ```
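//!
//! If the intermediate representation is needed, any `Serialize` type can be converted to a
//! `Value` explicitly; a minimal sketch using the exported `to_value` function:
//!
//! ```
//! use apache_avro::{to_value, types::Value};
//!
//! // convert any type implementing Serialize into the crate's Value representation
//! let value = to_value("foo").unwrap();
//! assert_eq!(value, Value::String("foo".to_owned()));
//! ```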
//!
//! ## Using codecs to compress data
//!
//! Avro supports several compression codecs when encoding data:
//!
//! * **Null**: leaves data uncompressed;
//! * **Deflate**: writes the data block using the deflate algorithm as specified in RFC 1951, and
//!   typically implemented using the zlib library. Note that this format (unlike the "zlib format" in
//!   RFC 1950) does not have a checksum.
//! * **Snappy**: uses Google's [Snappy](http://google.github.io/snappy/) compression library. Each
//!   compressed block is followed by the 4-byte, big-endian CRC32 checksum of the uncompressed data in
//!   the block. You must enable the `snappy` feature to use this codec.
//! * **Zstandard**: uses Facebook's [Zstandard](https://facebook.github.io/zstd/) compression library.
//!   You must enable the `zstandard` feature to use this codec.
//! * **Bzip2**: uses the [BZip2](https://sourceware.org/bzip2/) compression library.
//!   You must enable the `bzip` feature to use this codec.
//! * **Xz**: uses the [xz2](https://github.com/alexcrichton/xz2-rs) compression library.
//!   You must enable the `xz` feature to use this codec.
//!
//! To specify the codec used to compress data, just set it while creating a `Writer`:
//!
//! ```
//! use apache_avro::{Codec, DeflateSettings, Schema, Writer};
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//! ```
//!
//! # Reading data
//!
//! As far as reading Avro encoded data goes, we can just use the schema encoded with the data to
//! read it. The library will do it automatically for us, as it already does for the compression
//! codec:
//!
//! ```
//! use apache_avro::Reader;
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::new(&input[..]).unwrap();
//! ```
//!
//! In case, instead, we want to specify a different (but compatible) reader schema from the one
//! the data has been written with, we can just do as follows:
//! ```
//! use apache_avro::Schema;
//! use apache_avro::Reader;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let writer_raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
//! # let mut writer = Writer::new(&writer_schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//!
//! let reader_raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"},
//!             {"name": "c", "type": "long", "default": 43}
//!         ]
//!     }
//! "#;
//!
//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
//!
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
//! ```
//!
//! The library will also automatically perform schema resolution while reading the data.
//!
//! For more information about schema compatibility and resolution, please refer to the
//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-resolution).
//!
//! As usual, there are two ways to handle Avro data in Rust, as you can see below.
//!
//! **NOTE:** The library also provides a low-level interface for decoding a single datum in Avro
//! binary format without markers and header (for advanced use), but we highly recommend the
//! `Reader` interface to leverage all Avro features. Please read the API reference in case you
//! are interested.
//!
//! ## The avro way
//!
//! We can just read instances of `Value` directly out of the `Reader` iterator:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! use apache_avro::Reader;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // each item is a Result, since the read operation can fail
//! for value in reader {
//!     println!("{:?}", value.unwrap());
//! }
//! ```
//!
//! ## The serde way
//!
//! Alternatively, we can use a Rust type implementing `Deserialize` and representing our schema to
//! read the data into:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::Writer;
//! # use serde::{Deserialize, Serialize};
//! use apache_avro::Reader;
//! use apache_avro::from_value;
//!
//! # #[derive(Serialize)]
//! #[derive(Debug, Deserialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let test = Test {
//! #     a: 27,
//! #     b: "foo".to_owned(),
//! # };
//! # writer.append_ser(test).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // each item is a Result, since the read operation can fail
//! for value in reader {
//!     println!("{:?}", from_value::<Test>(&value.unwrap()));
//! }
//! ```
//!
//! # Putting everything together
//!
//! The following example combines everything shown so far and is meant to be a
//! quick reference of the library interface:
//!
//! ```
//! use apache_avro::{Codec, DeflateSettings, Reader, Schema, Writer, from_value, types::Record, Error};
//! use serde::{Deserialize, Serialize};
//!
//! #[derive(Debug, Deserialize, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("a", 27i64);
//!     record.put("b", "foo");
//!
//!     writer.append(record)?;
//!
//!     let test = Test {
//!         a: 27,
//!         b: "foo".to_owned(),
//!     };
//!
//!     writer.append_ser(test)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", from_value::<Test>(&record?));
//!     }
//!     Ok(())
//! }
//! ```
//!
//! `apache-avro` also supports the logical types listed in the [Avro specification](https://avro.apache.org/docs/current/specification/#logical-types):
//!
//! 1. `Decimal` using the [`num_bigint`](https://docs.rs/num-bigint/latest/num_bigint) crate
//! 1. UUID using the [`uuid`](https://docs.rs/uuid/latest/uuid) crate
//! 1. Date, Time (milli) as `i32` and Time (micro) as `i64`
//! 1. Timestamp (milli and micro) as `i64`
//! 1. Local timestamp (milli and micro) as `i64`
//! 1. Duration as a custom type with `months`, `days` and `millis` accessor methods, each of which returns an `i32`
//!
//! Note that the on-disk representation is identical to the underlying primitive/complex type.
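//!
//! As a quick illustration of that last point, here is a minimal sketch showing that a
//! `timestamp-millis` value and a plain `long` produce the same bytes (using the low-level
//! `to_avro_datum` function introduced earlier):
//!
//! ```
//! use apache_avro::{to_avro_datum, types::Value, Schema};
//!
//! let long_schema = Schema::parse_str(r#""long""#).unwrap();
//! let ts_schema = Schema::parse_str(
//!     r#"{"type": "long", "logicalType": "timestamp-millis"}"#,
//! ).unwrap();
//!
//! // the logical type only annotates the schema; the encoding is that of the underlying long
//! let as_long = to_avro_datum(&long_schema, Value::Long(4)).unwrap();
//! let as_timestamp = to_avro_datum(&ts_schema, Value::TimestampMillis(4)).unwrap();
//! assert_eq!(as_long, as_timestamp);
//! ```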
//!
//! ### Read and write logical types
//!
//! ```rust
//! use apache_avro::{
//!     types::Record, types::Value, Codec, Days, Decimal, DeflateSettings, Duration, Millis, Months, Reader, Schema,
//!     Writer, Error,
//! };
//! use num_bigint::ToBigInt;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!     {
//!       "type": "record",
//!       "name": "test",
//!       "fields": [
//!         {
//!           "name": "decimal_fixed",
//!           "type": {
//!             "type": "fixed",
//!             "size": 2,
//!             "name": "decimal"
//!           },
//!           "logicalType": "decimal",
//!           "precision": 4,
//!           "scale": 2
//!         },
//!         {
//!           "name": "decimal_var",
//!           "type": "bytes",
//!           "logicalType": "decimal",
//!           "precision": 10,
//!           "scale": 3
//!         },
//!         {
//!           "name": "uuid",
//!           "type": "string",
//!           "logicalType": "uuid"
//!         },
//!         {
//!           "name": "date",
//!           "type": "int",
//!           "logicalType": "date"
//!         },
//!         {
//!           "name": "time_millis",
//!           "type": "int",
//!           "logicalType": "time-millis"
//!         },
//!         {
//!           "name": "time_micros",
//!           "type": "long",
//!           "logicalType": "time-micros"
//!         },
//!         {
//!           "name": "timestamp_millis",
//!           "type": "long",
//!           "logicalType": "timestamp-millis"
//!         },
//!         {
//!           "name": "timestamp_micros",
//!           "type": "long",
//!           "logicalType": "timestamp-micros"
//!         },
//!         {
//!           "name": "local_timestamp_millis",
//!           "type": "long",
//!           "logicalType": "local-timestamp-millis"
//!         },
//!         {
//!           "name": "local_timestamp_micros",
//!           "type": "long",
//!           "logicalType": "local-timestamp-micros"
//!         },
//!         {
//!           "name": "duration",
//!           "type": {
//!             "type": "fixed",
//!             "size": 12,
//!             "name": "duration"
//!           },
//!           "logicalType": "duration"
//!         }
//!       ]
//!     }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("decimal_fixed", Decimal::from(9936.to_bigint().unwrap().to_signed_bytes_be()));
//!     record.put("decimal_var", Decimal::from(((-32442).to_bigint().unwrap()).to_signed_bytes_be()));
//!     record.put("uuid", uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap());
//!     record.put("date", Value::Date(1));
//!     record.put("time_millis", Value::TimeMillis(2));
//!     record.put("time_micros", Value::TimeMicros(3));
//!     record.put("timestamp_millis", Value::TimestampMillis(4));
//!     record.put("timestamp_micros", Value::TimestampMicros(5));
//!     record.put("local_timestamp_millis", Value::LocalTimestampMillis(4));
//!     record.put("local_timestamp_micros", Value::LocalTimestampMicros(5));
//!     record.put("duration", Duration::new(Months::new(6), Days::new(7), Millis::new(8)));
//!
//!     writer.append(record)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", record?);
//!     }
//!     Ok(())
//! }
//! ```
//!
//! ## Calculate Avro schema fingerprint
//!
//! This library supports calculating the following fingerprints:
//!
//! - SHA-256
//! - MD5
//! - Rabin
//!
//! An example of computing the supported fingerprints:
//!
//! ```rust
//! use apache_avro::rabin::Rabin;
//! use apache_avro::{Schema, Error};
//! use md5::Md5;
//! use sha2::Sha256;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!     let schema = Schema::parse_str(raw_schema)?;
//!     println!("{}", schema.fingerprint::<Sha256>());
//!     println!("{}", schema.fingerprint::<Md5>());
//!     println!("{}", schema.fingerprint::<Rabin>());
//!     Ok(())
//! }
//! ```
//!
//! ## Ill-formed data
//!
//! In order to ease decoding, the Binary Encoding specification of Avro data
//! requires some fields to have their length encoded alongside the data.
//!
//! If encoded data passed to a `Reader` is ill-formed, it can happen that
//! the bytes meant to contain the length of data are bogus and could result
//! in extravagant memory allocation.
//!
//! To shield users from ill-formed data, `apache-avro` sets a limit (default: 512MB)
//! to any allocation it will perform when decoding data.
//!
//! If you expect some of your data fields to be larger than this limit, be sure
//! to make use of the `max_allocation_bytes` function before reading **any** data
//! (we leverage Rust's [`std::sync::Once`](https://doc.rust-lang.org/std/sync/struct.Once.html)
//! mechanism to initialize this value; if any call to decode is made before a call to
//! `max_allocation_bytes`, the limit will be 512MB throughout the lifetime of the program).
//!
//! ```rust
//! use apache_avro::max_allocation_bytes;
//!
//! max_allocation_bytes(2 * 1024 * 1024 * 1024); // 2GB
//!
//! // ... happily decode large data
//! ```
//!
//! ## Check schemas compatibility
//!
//! This library supports checking for schema compatibility.
//!
//! Examples of checking for compatibility:
//!
//! 1. Compatible schemas
//!
//! Explanation: an int array schema can be read by a long array schema, because an int
//! (32-bit signed integer) fits into a long (64-bit signed integer).
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_ok());
//! ```
//!
//! 2. Incompatible schemas
//!
//! Explanation: a long array schema cannot be read by an int array schema, because a
//! long (64-bit signed integer) does not fit into an int (32-bit signed integer).
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err());
//! ```
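//!
//! A related check is `SchemaCompatibility::mutual_read`, which verifies that each schema can
//! read data written with the other. A minimal sketch (the int-to-long promotion above only
//! works one way, so these two schemas are not mutually compatible):
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let schema_one = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! let schema_two = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//!
//! // reading long data with the int schema fails, so the pair is rejected
//! assert!(SchemaCompatibility::mutual_read(&schema_one, &schema_two).is_err());
//! ```
//!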
//! ## Custom names validators
//!
//! By default the library follows the rules defined by the
//! [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names).
//!
//! Some of the other Apache Avro language SDKs are not that strict and allow more
//! characters in names. For interoperability with those SDKs, the library provides
//! a way to customize the name validation.
//!
//! ```rust
//! use apache_avro::AvroResult;
//! use apache_avro::schema::Namespace;
//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator};
//!
//! struct MyCustomValidator;
//!
//! impl SchemaNameValidator for MyCustomValidator {
//!     fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom validator(s)!
//!
//! set_schema_name_validator(Box::new(MyCustomValidator));
//!
//! // ... use the library
//! ```
//!
//! Similar logic can be applied to the validation of schema namespaces, enum symbols and field names.
//!
//! **Note**: the library allows setting a validator only once per application lifetime!
//! If the application parses schemas before setting a validator, the default validator will be
//! registered and used!
//!
//! ## Custom schema equality comparators
//!
//! The library provides two implementations of schema equality comparators:
//! 1. `SpecificationEq` - a comparator that serializes the schemas to their
//!    canonical forms (i.e. JSON) and compares them as strings. It was the only implementation
//!    until apache_avro 0.16.0.
//!    See the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas)
//!    for more information!
//! 2. `StructFieldEq` - a comparator that compares the schemas structurally.
//!    It is faster than `SpecificationEq` because it returns `false` as soon as a difference
//!    is found, and is recommended for use!
//!    It has been the default comparator since apache_avro 0.17.0.
//!
//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the
//! `set_schemata_equality_comparator` function:
//!
//! ```rust
//! use apache_avro::{AvroResult, Schema};
//! use apache_avro::schema::Namespace;
//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator};
//!
//! #[derive(Debug)]
//! struct MyCustomSchemataEq;
//!
//! impl SchemataEq for MyCustomSchemataEq {
//!     fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom comparator!
//!
//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq));
//!
//! // ... use the library
//! ```
//!
//! **Note**: the library allows setting a comparator only once per application lifetime!
//! If the application parses schemas before setting a comparator, the default comparator will be
//! registered and used!

mod bigdecimal;
mod bytes;
mod codec;
mod de;
mod decimal;
mod decode;
mod duration;
mod encode;
mod error;
mod reader;
mod ser;
mod ser_schema;
mod util;
mod writer;

pub mod rabin;
pub mod schema;
pub mod schema_compatibility;
pub mod schema_equality;
pub mod types;
pub mod validator;

pub use crate::{
    bigdecimal::BigDecimal,
    bytes::{
        serde_avro_bytes, serde_avro_bytes_opt, serde_avro_fixed, serde_avro_fixed_opt,
        serde_avro_slice, serde_avro_slice_opt,
    },
};
#[cfg(feature = "bzip")]
pub use codec::bzip::Bzip2Settings;
#[cfg(feature = "xz")]
pub use codec::xz::XzSettings;
#[cfg(feature = "zstandard")]
pub use codec::zstandard::ZstandardSettings;
pub use codec::{Codec, DeflateSettings};
pub use de::from_value;
pub use decimal::Decimal;
pub use duration::{Days, Duration, Millis, Months};
pub use error::Error;
pub use reader::{
    from_avro_datum, from_avro_datum_reader_schemata, from_avro_datum_schemata, read_marker,
    GenericSingleObjectReader, Reader, SpecificSingleObjectReader,
};
pub use schema::{AvroSchema, Schema};
pub use ser::to_value;
pub use util::{max_allocation_bytes, set_serde_human_readable};
pub use uuid::Uuid;
pub use writer::{
    to_avro_datum, to_avro_datum_schemata, GenericSingleObjectWriter, SpecificSingleObjectWriter,
    Writer,
};

#[cfg(feature = "derive")]
pub use apache_avro_derive::*;

/// A convenience type alias for `Result`s with `Error`s.
pub type AvroResult<T> = Result<T, Error>;

#[cfg(test)]
mod tests {
    use crate::{
        from_avro_datum,
        types::{Record, Value},
        Codec, Reader, Schema, Writer,
    };
    use pretty_assertions::assert_eq;

    //TODO: move where it fits better
    #[test]
    fn test_enum_default() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;
        let reader_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_string_value() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let schema = Schema::parse_str(raw_schema).unwrap();
        let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_no_reader_schema() {
        let writer_raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"},
                    {
                        "name": "c",
                        "type": {
                            "type": "enum",
                            "name": "suit",
                            "symbols": ["diamonds", "spades", "clubs", "hearts"]
                        },
                        "default": "spades"
                    }
                ]
            }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::new(&input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
    }

    #[test]
    fn test_illformed_length() {
        let raw_schema = r#"
            {
                "type": "record",
                "name": "test",
                "fields": [
                    {"name": "a", "type": "long", "default": 42},
                    {"name": "b", "type": "string"}
                ]
            }
        "#;

        let schema = Schema::parse_str(raw_schema).unwrap();

        // would allocate 18446744073709551605 bytes
        let illformed: &[u8] = &[0x3e, 0x15, 0xff, 0x1f, 0x15, 0xff];

        let value = from_avro_datum(&schema, &mut &*illformed, None);
        assert!(value.is_err());
    }
}