apache_avro/lib.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! [![rust continuous integration][rust continuous integration img]][rust continuous integration]
//! [![rust clippy check][rust clippy check img]][rust clippy check]
//! [![rust security audit][rust security audit img]][rust security audit]
//!
//! [rust continuous integration]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci.yml
//! [rust clippy check]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-clippy.yml
//! [rust security audit]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-audit.yml
//!
//! [rust continuous integration img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-ci.yml/badge.svg
//! [rust clippy check img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-clippy.yml/badge.svg
//! [rust security audit img]: https://github.com/apache/avro-rs/actions/workflows/test-lang-rust-audit.yml/badge.svg
//!
//! A library for working with [Apache Avro](https://avro.apache.org/) in Rust.
//!
//! Please check our [documentation](https://docs.rs/apache-avro) for examples, tutorials and API reference.
//!
//! **[Apache Avro](https://avro.apache.org/)** is a data serialization system which provides rich
//! data structures and a compact, fast, binary data format.
//!
//! All data in Avro is schematized, as in the following example:
//!
//! ```json
//! {
//!     "type": "record",
//!     "name": "test",
//!     "fields": [
//!         {"name": "a", "type": "long", "default": 42},
//!         {"name": "b", "type": "string"}
//!     ]
//! }
//! ```
//!
//! There are basically two ways of handling Avro data in Rust:
//!
//! * **as Avro-specialized data types** based on an Avro schema;
//! * **as generic Rust serde-compatible types** implementing/deriving `Serialize` and `Deserialize`;
//!
//! **apache-avro** provides a way to read and write both these data representations easily and
//! efficiently.
//!
//! # Installing the library
//!
//! Add to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! apache-avro = "x.y"
//! ```
//!
//! Or in case you want to leverage the **Snappy** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["snappy"]
//! ```
//!
//! Or in case you want to leverage the **Zstandard** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["zstandard"]
//! ```
//!
//! Or in case you want to leverage the **Bzip2** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["bzip"]
//! ```
//!
//! Or in case you want to leverage the **Xz** codec:
//!
//! ```toml
//! [dependencies.apache-avro]
//! version = "x.y"
//! features = ["xz"]
//! ```
//!
//! # Upgrading to a newer minor version
//!
//! The library is still in beta, so there might be backward-incompatible changes between minor
//! versions. If you have trouble upgrading, check the release notes.
//!
//! # Minimum supported Rust version
//!
//! 1.85.0
//!
//! # Defining a schema
//!
//! Avro data cannot exist without an Avro schema. Schemas **must** be used when writing and
//! **can** be used when reading; they carry the information regarding the type of data we are
//! handling. Avro schemas are used for both schema validation and resolution of Avro data.
//!
//! Avro schemas are defined in **JSON** format and can just be parsed out of a raw string:
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"}
//!         ]
//!     }
//! "#;
//!
//! // if the schema is not valid, this function will return an error
//! let schema = Schema::parse_str(raw_schema).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schema);
//! ```
//!
//! Additionally, a list of definitions (which may depend on each other) can be given and all of
//! them will be parsed into the corresponding schemas.
//!
//! ```
//! use apache_avro::Schema;
//!
//! let raw_schema_1 = r#"{
//!     "name": "A",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "float"}
//!     ]
//! }"#;
//!
//! // This definition depends on the definition of A above
//! let raw_schema_2 = r#"{
//!     "name": "B",
//!     "type": "record",
//!     "fields": [
//!         {"name": "field_one", "type": "A"}
//!     ]
//! }"#;
//!
//! // if the schemas are not valid, this function will return an error
//! let schemas = Schema::parse_list(&[raw_schema_1, raw_schema_2]).unwrap();
//!
//! // schemas can be printed for debugging
//! println!("{:?}", schemas);
//! ```
//!
//! *N.B.* It is important to note that the composition of schema definitions requires schemas with names.
//! For this reason, only schemas of type Record, Enum, and Fixed should be input into this function.
//!
//! The library also provides a programmatic interface to define schemas without encoding them in
//! JSON (for advanced use), but we highly recommend the JSON interface. Please read the API
//! reference in case you are interested.
//!
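//! For instance, a primitive schema can be built directly as a variant of the `Schema` enum
//! (a minimal sketch; records, enums and other named types have dedicated representations,
//! see the API reference):
//!
//! ```
//! use apache_avro::Schema;
//!
//! // primitive schemas are unit variants of the `Schema` enum; this is
//! // equivalent to parsing the JSON schema "long"
//! let long_schema = Schema::Long;
//! assert_eq!(long_schema, Schema::parse_str(r#""long""#).unwrap());
//! ```
//!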
//! For more information about schemas and what kind of information you can encapsulate in them,
//! please refer to the appropriate section of the
//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration).
//!
//! # Writing data
//!
//! Once we have defined a schema, we are ready to serialize data in Avro, validating it against
//! the provided schema in the process. As mentioned before, there are two ways of handling Avro
//! data in Rust.
//!
//! **NOTE:** The library also provides a low-level interface for encoding a single datum in Avro
//! bytecode without generating markers and headers (for advanced use), but we highly recommend the
//! `Writer` interface for full Avro compatibility. Please read the API reference in case you are
//! interested.
//!
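//! As a quick illustration of that low-level interface, a single record can be encoded with
//! `to_avro_datum` (a minimal sketch; the resulting bytes carry no header, schema or sync
//! markers, so the reader must already know the schema):
//!
//! ```
//! use apache_avro::{Schema, to_avro_datum, types::Record};
//!
//! let schema = Schema::parse_str(r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [{"name": "a", "type": "long", "default": 42}]
//!     }
//! "#).unwrap();
//! let mut record = Record::new(&schema).unwrap();
//! record.put("a", 27i64);
//! // encodes just the datum, without the object container file framing
//! let encoded = to_avro_datum(&schema, record).unwrap();
//! ```
//!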
//! ## The avro way
//!
//! Given that the schema we defined above is that of an Avro *Record*, we are going to use the
//! associated type provided by the library to specify the data we want to serialize:
//!
//! ```
//! # use apache_avro::Schema;
//! use apache_avro::types::Record;
//! use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the Record type models our Record schema
//! let mut record = Record::new(writer.schema()).unwrap();
//! record.put("a", 27i64);
//! record.put("b", "foo");
//!
//! // schema validation happens here
//! writer.append(record).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data has been written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! The vast majority of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, the library offers that capability via the
//! `Value` interface.
//!
//! ```
//! use apache_avro::types::Value;
//!
//! let value = Value::String("foo".to_string());
//! ```
//!
//! ## The serde way
//!
//! Given that the schema we defined above is an Avro *Record*, we can directly use a Rust struct
//! deriving `Serialize` to model our data:
//!
//! ```
//! # use apache_avro::Schema;
//! # use serde::Serialize;
//! use apache_avro::Writer;
//!
//! #[derive(Debug, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! // a writer needs a schema and something to write to
//! let mut writer = Writer::new(&schema, Vec::new());
//!
//! // the structure models our Record schema
//! let test = Test {
//!     a: 27,
//!     b: "foo".to_owned(),
//! };
//!
//! // schema validation happens here
//! writer.append_ser(test).unwrap();
//!
//! // this is how to get back the resulting avro bytecode
//! // this performs a flush operation to make sure data is written, so it can fail
//! // you can also call `writer.flush()` yourself without consuming the writer
//! let encoded = writer.into_inner().unwrap();
//! ```
//!
//! ### Importance of the fields' order
//!
//! *Important*: The order of the fields in the struct must match the order of the fields in the Avro schema!
//!
//! ### Simple types
//!
//! The vast majority of the time, schemas tend to define a record as a top-level container
//! encapsulating all the values to convert as fields and providing documentation for them, but in
//! case we want to directly define an Avro value, any type implementing `Serialize` should work.
//!
//! ```
//! let value = "foo".to_string();
//! ```
//!
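//! Such a value can then be written through the usual `Writer`, as long as the schema matches it
//! (a minimal sketch, here assuming a plain `"string"` schema):
//!
//! ```
//! use apache_avro::{Schema, Writer};
//!
//! let schema = Schema::parse_str(r#""string""#).unwrap();
//! let mut writer = Writer::new(&schema, Vec::new());
//! // any Serialize value that matches the schema can be appended
//! writer.append_ser("foo".to_string()).unwrap();
//! let encoded = writer.into_inner().unwrap();
//! ```
//!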
//! ## Using codecs to compress data
//!
//! Avro supports the following compression codecs when encoding data:
//!
//! * **Null**: leaves data uncompressed;
//! * **Deflate**: writes the data block using the deflate algorithm as specified in RFC 1951, and
//!   typically implemented using the zlib library. Note that this format (unlike the "zlib format" in
//!   RFC 1950) does not have a checksum.
//! * **Snappy**: uses Google's [Snappy](http://google.github.io/snappy/) compression library. Each
//!   compressed block is followed by the 4-byte, big-endian CRC32 checksum of the uncompressed data in
//!   the block. You must enable the `snappy` feature to use this codec.
//! * **Zstandard**: uses Facebook's [Zstandard](https://facebook.github.io/zstd/) compression library.
//!   You must enable the `zstandard` feature to use this codec.
//! * **Bzip2**: uses the [BZip2](https://sourceware.org/bzip2/) compression library.
//!   You must enable the `bzip` feature to use this codec.
//! * **Xz**: uses the [xz2](https://github.com/alexcrichton/xz2-rs) compression library.
//!   You must enable the `xz` feature to use this codec.
//!
//! To compress the data, specify a codec while creating the `Writer`:
//!
//! ```
//! use apache_avro::{Codec, DeflateSettings, Schema, Writer};
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//! ```
//!
//! # Reading data
//!
//! As far as reading Avro encoded data goes, we can just use the schema encoded with the data to
//! read it. The library will do it automatically for us, as it already does for the compression
//! codec:
//!
//! ```
//! use apache_avro::Reader;
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::new(&input[..]).unwrap();
//! ```
//!
//! If we instead want to specify a reader schema that is different from (but compatible with)
//! the schema the data has been written with, we can do the following:
//!
//! ```
//! use apache_avro::Schema;
//! use apache_avro::Reader;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! #
//! # let writer_raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
//! # let mut writer = Writer::new(&writer_schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//!
//! let reader_raw_schema = r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [
//!             {"name": "a", "type": "long", "default": 42},
//!             {"name": "b", "type": "string"},
//!             {"name": "c", "type": "long", "default": 43}
//!         ]
//!     }
//! "#;
//!
//! let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
//!
//! // reader creation can fail in case the input to read from is not Avro-compatible or malformed
//! let reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
//! ```
//!
//! The library will also automatically perform schema resolution while reading the data.
//!
//! For more information about schema compatibility and resolution, please refer to the
//! [Avro Specification](https://avro.apache.org/docs/current/specification/#schema-declaration).
//!
//! As usual, there are two ways to handle Avro data in Rust, as you can see below.
//!
//! **NOTE:** The library also provides a low-level interface for decoding a single datum in Avro
//! bytecode without markers and headers (for advanced use), but we highly recommend the `Reader`
//! interface to leverage all Avro features. Please read the API reference in case you are
//! interested.
//!
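//! For completeness, this is what a single-datum round trip looks like with `to_avro_datum` and
//! `from_avro_datum` (a minimal sketch; pass a reader schema instead of `None` to also perform
//! schema resolution):
//!
//! ```
//! use apache_avro::{Schema, from_avro_datum, to_avro_datum, types::{Record, Value}};
//!
//! let schema = Schema::parse_str(r#"
//!     {
//!         "type": "record",
//!         "name": "test",
//!         "fields": [{"name": "a", "type": "long", "default": 42}]
//!     }
//! "#).unwrap();
//! let mut record = Record::new(&schema).unwrap();
//! record.put("a", 27i64);
//! let encoded = to_avro_datum(&schema, record).unwrap();
//!
//! // decoding needs the writer schema; `None` means no separate reader schema
//! let decoded = from_avro_datum(&schema, &mut &encoded[..], None).unwrap();
//! assert!(matches!(decoded, Value::Record(_)));
//! ```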
//!
//! ## The avro way
//!
//! We can read instances of `Value` directly out of the `Reader` iterator:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::types::Record;
//! # use apache_avro::Writer;
//! use apache_avro::Reader;
//! #
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let mut record = Record::new(writer.schema()).unwrap();
//! # record.put("a", 27i64);
//! # record.put("b", "foo");
//! # writer.append(record).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // each value is a Result holding an Avro Value, in case the read operation fails
//! for value in reader {
//!     println!("{:?}", value.unwrap());
//! }
//! ```
//!
//! ## The serde way
//!
//! Alternatively, we can use a Rust type implementing `Deserialize` and representing our schema to
//! read the data into:
//!
//! ```
//! # use apache_avro::Schema;
//! # use apache_avro::Writer;
//! # use serde::{Deserialize, Serialize};
//! use apache_avro::Reader;
//! use apache_avro::from_value;
//!
//! # #[derive(Serialize)]
//! #[derive(Debug, Deserialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! # let raw_schema = r#"
//! #     {
//! #         "type": "record",
//! #         "name": "test",
//! #         "fields": [
//! #             {"name": "a", "type": "long", "default": 42},
//! #             {"name": "b", "type": "string"}
//! #         ]
//! #     }
//! # "#;
//! # let schema = Schema::parse_str(raw_schema).unwrap();
//! # let mut writer = Writer::new(&schema, Vec::new());
//! # let test = Test {
//! #     a: 27,
//! #     b: "foo".to_owned(),
//! # };
//! # writer.append_ser(test).unwrap();
//! # let input = writer.into_inner().unwrap();
//! let reader = Reader::new(&input[..]).unwrap();
//!
//! // each value is a Result, in case the read operation fails
//! for value in reader {
//!     println!("{:?}", from_value::<Test>(&value.unwrap()));
//! }
//! ```
//!
//! # Putting everything together
//!
//! The following example combines everything shown so far and is meant to be a quick reference
//! of the library interface:
//!
//! ```
//! use apache_avro::{Codec, DeflateSettings, Reader, Schema, Writer, from_value, types::Record, Error};
//! use serde::{Deserialize, Serialize};
//!
//! #[derive(Debug, Deserialize, Serialize)]
//! struct Test {
//!     a: i64,
//!     b: String,
//! }
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("a", 27i64);
//!     record.put("b", "foo");
//!
//!     writer.append(record)?;
//!
//!     let test = Test {
//!         a: 27,
//!         b: "foo".to_owned(),
//!     };
//!
//!     writer.append_ser(test)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", from_value::<Test>(&record?));
//!     }
//!     Ok(())
//! }
//! ```
//!
//! `apache-avro` also supports the logical types listed in the [Avro specification](https://avro.apache.org/docs/current/specification/#logical-types):
//!
//! 1. `Decimal` using the [`num_bigint`](https://docs.rs/num-bigint/latest/num_bigint) crate
//! 1. UUID using the [`uuid`](https://docs.rs/uuid/latest/uuid) crate
//! 1. Date, Time (milli) as `i32` and Time (micro) as `i64`
//! 1. Timestamp (milli and micro) as `i64`
//! 1. Local timestamp (milli and micro) as `i64`
//! 1. Duration as a custom type with `months`, `days` and `millis` accessor methods, each of which returns an `i32`
//!
//! Note that the on-disk representation is identical to the underlying primitive/complex type.
//!
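//! For example, a `date` value produces exactly the same bytes as the plain `int` it wraps
//! (a minimal sketch using the single-datum encoder exported by this crate):
//!
//! ```
//! use apache_avro::{Schema, to_avro_datum, types::Value};
//!
//! let date_schema = Schema::parse_str(r#"{"type": "int", "logicalType": "date"}"#).unwrap();
//! let int_schema = Schema::parse_str(r#""int""#).unwrap();
//! // both encode to the same zig-zag encoded int
//! let as_date = to_avro_datum(&date_schema, Value::Date(1)).unwrap();
//! let as_int = to_avro_datum(&int_schema, Value::Int(1)).unwrap();
//! assert_eq!(as_date, as_int);
//! ```
//!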
//! ### Read and write logical types
//!
//! ```rust
//! use apache_avro::{
//!     types::Record, types::Value, Codec, Days, Decimal, DeflateSettings, Duration, Millis, Months, Reader, Schema,
//!     Writer, Error,
//! };
//! use num_bigint::ToBigInt;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!     {
//!       "type": "record",
//!       "name": "test",
//!       "fields": [
//!         {
//!           "name": "decimal_fixed",
//!           "type": {
//!             "type": "fixed",
//!             "size": 2,
//!             "name": "decimal"
//!           },
//!           "logicalType": "decimal",
//!           "precision": 4,
//!           "scale": 2
//!         },
//!         {
//!           "name": "decimal_var",
//!           "type": "bytes",
//!           "logicalType": "decimal",
//!           "precision": 10,
//!           "scale": 3
//!         },
//!         {
//!           "name": "uuid",
//!           "type": "string",
//!           "logicalType": "uuid"
//!         },
//!         {
//!           "name": "date",
//!           "type": "int",
//!           "logicalType": "date"
//!         },
//!         {
//!           "name": "time_millis",
//!           "type": "int",
//!           "logicalType": "time-millis"
//!         },
//!         {
//!           "name": "time_micros",
//!           "type": "long",
//!           "logicalType": "time-micros"
//!         },
//!         {
//!           "name": "timestamp_millis",
//!           "type": "long",
//!           "logicalType": "timestamp-millis"
//!         },
//!         {
//!           "name": "timestamp_micros",
//!           "type": "long",
//!           "logicalType": "timestamp-micros"
//!         },
//!         {
//!           "name": "local_timestamp_millis",
//!           "type": "long",
//!           "logicalType": "local-timestamp-millis"
//!         },
//!         {
//!           "name": "local_timestamp_micros",
//!           "type": "long",
//!           "logicalType": "local-timestamp-micros"
//!         },
//!         {
//!           "name": "duration",
//!           "type": {
//!             "type": "fixed",
//!             "size": 12,
//!             "name": "duration"
//!           },
//!           "logicalType": "duration"
//!         }
//!       ]
//!     }
//!     "#;
//!
//!     let schema = Schema::parse_str(raw_schema)?;
//!
//!     println!("{:?}", schema);
//!
//!     let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Deflate(DeflateSettings::default()));
//!
//!     let mut record = Record::new(writer.schema()).unwrap();
//!     record.put("decimal_fixed", Decimal::from(9936.to_bigint().unwrap().to_signed_bytes_be()));
//!     record.put("decimal_var", Decimal::from(((-32442).to_bigint().unwrap()).to_signed_bytes_be()));
//!     record.put("uuid", uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap());
//!     record.put("date", Value::Date(1));
//!     record.put("time_millis", Value::TimeMillis(2));
//!     record.put("time_micros", Value::TimeMicros(3));
//!     record.put("timestamp_millis", Value::TimestampMillis(4));
//!     record.put("timestamp_micros", Value::TimestampMicros(5));
//!     record.put("local_timestamp_millis", Value::LocalTimestampMillis(4));
//!     record.put("local_timestamp_micros", Value::LocalTimestampMicros(5));
//!     record.put("duration", Duration::new(Months::new(6), Days::new(7), Millis::new(8)));
//!
//!     writer.append(record)?;
//!
//!     let input = writer.into_inner()?;
//!     let reader = Reader::with_schema(&schema, &input[..])?;
//!
//!     for record in reader {
//!         println!("{:?}", record?);
//!     }
//!     Ok(())
//! }
//! ```
//!
//! ## Calculate Avro schema fingerprint
//!
//! This library supports calculating the following fingerprints:
//!
//! - SHA-256
//! - MD5
//! - Rabin
//!
//! An example of computing the supported fingerprints:
//!
//! ```rust
//! use apache_avro::rabin::Rabin;
//! use apache_avro::{Schema, Error};
//! use md5::Md5;
//! use sha2::Sha256;
//!
//! fn main() -> Result<(), Error> {
//!     let raw_schema = r#"
//!         {
//!             "type": "record",
//!             "name": "test",
//!             "fields": [
//!                 {"name": "a", "type": "long", "default": 42},
//!                 {"name": "b", "type": "string"}
//!             ]
//!         }
//!     "#;
//!     let schema = Schema::parse_str(raw_schema)?;
//!     println!("{}", schema.fingerprint::<Sha256>());
//!     println!("{}", schema.fingerprint::<Md5>());
//!     println!("{}", schema.fingerprint::<Rabin>());
//!     Ok(())
//! }
//! ```
//!
//! ## Ill-formed data
//!
//! In order to ease decoding, the Binary Encoding specification of Avro data
//! requires some fields to have their length encoded alongside the data.
//!
//! If the encoded data passed to a `Reader` is ill-formed, it can happen that
//! the bytes meant to contain the length of the data are bogus and could result
//! in an extravagant memory allocation.
//!
//! To shield users from ill-formed data, `apache-avro` sets a limit (default: 512MB)
//! on any allocation it will perform when decoding data.
//!
//! If you expect some of your data fields to be larger than this limit, be sure
//! to make use of the `max_allocation_bytes` function before reading **any** data
//! (we leverage Rust's [`std::sync::Once`](https://doc.rust-lang.org/std/sync/struct.Once.html)
//! mechanism to initialize this value; if any call to decode is made before a call to
//! `max_allocation_bytes`, the limit will be 512MB throughout the lifetime of the program).
//!
//! ```rust
//! use apache_avro::max_allocation_bytes;
//!
//! max_allocation_bytes(2 * 1024 * 1024 * 1024); // 2GB
//!
//! // ... happily decode large data
//! ```
//!
//! ## Check schema compatibility
//!
//! This library supports checking schema compatibility.
//!
//! Examples of checking for compatibility:
//!
//! 1. Compatible schemas
//!
//! Explanation: an int array schema can be read by a long array schema: an int
//! (32-bit signed integer) fits into a long (64-bit signed integer).
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_ok());
//! ```
//!
//! 2. Incompatible schemas
//!
//! Explanation: a long array schema cannot be read by an int array schema: a
//! long (64-bit signed integer) does not fit into an int (32-bit signed integer).
//!
//! ```rust
//! use apache_avro::{Schema, schema_compatibility::SchemaCompatibility};
//!
//! let writers_schema = Schema::parse_str(r#"{"type": "array", "items":"long"}"#).unwrap();
//! let readers_schema = Schema::parse_str(r#"{"type": "array", "items":"int"}"#).unwrap();
//! assert!(SchemaCompatibility::can_read(&writers_schema, &readers_schema).is_err());
//! ```
//!
//! ## Custom names validators
//!
//! By default the library follows the rules defined by the
//! [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#names)!
//!
//! Some of the other Apache Avro language SDKs are not that strict and allow more
//! characters in names. For interoperability with those SDKs, the library provides
//! a way to customize the names validation.
//!
//! ```rust
//! use apache_avro::AvroResult;
//! use apache_avro::schema::Namespace;
//! use apache_avro::validator::{SchemaNameValidator, set_schema_name_validator};
//!
//! struct MyCustomValidator;
//!
//! impl SchemaNameValidator for MyCustomValidator {
//!     fn validate(&self, name: &str) -> AvroResult<(String, Namespace)> {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom validator(s)!
//!
//! set_schema_name_validator(Box::new(MyCustomValidator));
//!
//! // ... use the library
//! ```
//!
//! Similar logic can be applied to the validation of schema namespaces, enum symbols and field names.
//!
//! **Note**: the library allows setting a validator only once per application lifetime!
//! If the application parses schemas before setting a validator, the default validator will be
//! registered and used!
//!
//! ## Custom schema equality comparators
//!
//! The library provides two implementations of schema equality comparators:
//! 1. `SpecificationEq` - a comparator that serializes the schemas to their
//!    canonical forms (i.e. JSON) and compares them as strings. It was the only implementation
//!    until apache_avro 0.16.0.
//!    See the [Avro specification](https://avro.apache.org/docs/1.11.1/specification/#parsing-canonical-form-for-schemas)
//!    for more information!
//! 2. `StructFieldEq` - a comparator that compares the schemas structurally.
//!    It is faster than `SpecificationEq` because it returns `false` as soon as a difference
//!    is found, and is recommended for use!
//!    It has been the default comparator since apache_avro 0.17.0.
//!
//! To use a custom comparator, you need to implement the `SchemataEq` trait and set it using the
//! `set_schemata_equality_comparator` function:
//!
//! ```rust
//! use apache_avro::{AvroResult, Schema};
//! use apache_avro::schema::Namespace;
//! use apache_avro::schema_equality::{SchemataEq, set_schemata_equality_comparator};
//!
//! #[derive(Debug)]
//! struct MyCustomSchemataEq;
//!
//! impl SchemataEq for MyCustomSchemataEq {
//!     fn compare(&self, schema_one: &Schema, schema_two: &Schema) -> bool {
//!         todo!()
//!     }
//! }
//!
//! // don't parse any schema before registering the custom comparator!
//!
//! set_schemata_equality_comparator(Box::new(MyCustomSchemataEq));
//!
//! // ... use the library
//! ```
//!
//! **Note**: the library allows setting a comparator only once per application lifetime!
//! If the application parses schemas before setting a comparator, the default comparator will be
//! registered and used!
//!

mod bigdecimal;
mod bytes;
mod codec;
mod de;
mod decimal;
mod decode;
mod duration;
mod encode;
mod reader;
mod ser;
mod ser_schema;
mod util;
mod writer;

pub mod error;
pub mod headers;
pub mod rabin;
pub mod schema;
pub mod schema_compatibility;
pub mod schema_equality;
pub mod types;
pub mod validator;

pub use crate::{
    bigdecimal::BigDecimal,
    bytes::{
        serde_avro_bytes, serde_avro_bytes_opt, serde_avro_fixed, serde_avro_fixed_opt,
        serde_avro_slice, serde_avro_slice_opt,
    },
};
#[cfg(feature = "bzip")]
pub use codec::bzip::Bzip2Settings;
#[cfg(feature = "xz")]
pub use codec::xz::XzSettings;
#[cfg(feature = "zstandard")]
pub use codec::zstandard::ZstandardSettings;
pub use codec::{Codec, DeflateSettings};
pub use de::from_value;
pub use decimal::Decimal;
pub use duration::{Days, Duration, Millis, Months};
pub use error::Error;
pub use reader::{
    GenericSingleObjectReader, Reader, SpecificSingleObjectReader, from_avro_datum,
    from_avro_datum_reader_schemata, from_avro_datum_schemata, read_marker,
};
pub use schema::{AvroSchema, Schema};
pub use ser::to_value;
pub use util::{max_allocation_bytes, set_serde_human_readable};
pub use uuid::Uuid;
pub use writer::{
    GenericSingleObjectWriter, SpecificSingleObjectWriter, Writer, WriterBuilder, to_avro_datum,
    to_avro_datum_schemata, write_avro_datum_ref,
};

#[cfg(feature = "derive")]
pub use apache_avro_derive::*;

/// A convenience type alias for `Result`s with `Error`s.
pub type AvroResult<T> = Result<T, Error>;

#[cfg(test)]
mod tests {
    use crate::{
        Codec, Reader, Schema, Writer, from_avro_datum,
        types::{Record, Value},
    };
    use pretty_assertions::assert_eq;

    //TODO: move where it fits better
    #[test]
    fn test_enum_default() {
        let writer_raw_schema = r#"
        {
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"}
            ]
        }
        "#;
        let reader_raw_schema = r#"
        {
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"},
                {
                    "name": "c",
                    "type": {
                        "type": "enum",
                        "name": "suit",
                        "symbols": ["diamonds", "spades", "clubs", "hearts"]
                    },
                    "default": "spades"
                }
            ]
        }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let reader_schema = Schema::parse_str(reader_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&reader_schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(1, "spades".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_string_value() {
        let raw_schema = r#"
        {
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"},
                {
                    "name": "c",
                    "type": {
                        "type": "enum",
                        "name": "suit",
                        "symbols": ["diamonds", "spades", "clubs", "hearts"]
                    },
                    "default": "spades"
                }
            ]
        }
        "#;
        let schema = Schema::parse_str(raw_schema).unwrap();
        let mut writer = Writer::with_codec(&schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::with_schema(&schema, &input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
        assert!(reader.next().is_none());
    }

    //TODO: move where it fits better
    #[test]
    fn test_enum_no_reader_schema() {
        let writer_raw_schema = r#"
        {
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"},
                {
                    "name": "c",
                    "type": {
                        "type": "enum",
                        "name": "suit",
                        "symbols": ["diamonds", "spades", "clubs", "hearts"]
                    },
                    "default": "spades"
                }
            ]
        }
        "#;
        let writer_schema = Schema::parse_str(writer_raw_schema).unwrap();
        let mut writer = Writer::with_codec(&writer_schema, Vec::new(), Codec::Null);
        let mut record = Record::new(writer.schema()).unwrap();
        record.put("a", 27i64);
        record.put("b", "foo");
        record.put("c", "clubs");
        writer.append(record).unwrap();
        let input = writer.into_inner().unwrap();
        let mut reader = Reader::new(&input[..]).unwrap();
        assert_eq!(
            reader.next().unwrap().unwrap(),
            Value::Record(vec![
                ("a".to_string(), Value::Long(27)),
                ("b".to_string(), Value::String("foo".to_string())),
                ("c".to_string(), Value::Enum(2, "clubs".to_string())),
            ])
        );
    }

    #[test]
    fn test_illformed_length() {
        let raw_schema = r#"
        {
            "type": "record",
            "name": "test",
            "fields": [
                {"name": "a", "type": "long", "default": 42},
                {"name": "b", "type": "string"}
            ]
        }
        "#;

        let schema = Schema::parse_str(raw_schema).unwrap();

        // Would allocate 18446744073709551605 bytes
        let illformed: &[u8] = &[0x3e, 0x15, 0xff, 0x1f, 0x15, 0xff];

        let value = from_avro_datum(&schema, &mut &*illformed, None);
        assert!(value.is_err());
    }
}
1086}