diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 8513216..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: ci - -on: [push, pull_request] - -jobs: - linux: - name: Test Suite (linux) - runs-on: ubuntu-latest - strategy: - matrix: - rust: - - stable - - nightly - - 1.37.0 - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - toolchain: ${{ matrix.rust }} - - run: cargo test --release --all-features - - windows: - name: Test suite (windows) - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - with: - toolchain: ${{ matrix.rust }} - - run: cargo test --all-features - - lints: - name: Lints - runs-on: ubuntu-latest - steps: - - name: Checkout sources - uses: actions/checkout@v2 - - - name: Install stable toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - components: rustfmt, clippy - - - name: Run cargo fmt - uses: actions-rs/cargo@v1 - with: - command: fmt - args: --all -- --check - - - name: Run cargo clippy - uses: actions-rs/cargo@v1 - with: - command: clippy - args: -- -D warnings diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 224af64..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "type": "lldb", - "request": "launch", - "name": "Debug", - "program": "${workspaceFolder}/", - "args": [], - "cwd": "${workspaceFolder}" - } - ] -} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index dcbef95..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,31 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. 
- -The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [Unreleased] - -## 0.2.1 - 2020-11-27 - -### Fixed -- Named schema resolution outside of union variants. -- Implicitly defined named schemas should resolve in union variants [#6](https://github.com/creativcoder/avrow/issues/6) -- Default values in union schema fields in records should parse correctly [#1](https://github.com/creativcoder/avrow/issues/1) - -### Updated -- Documentation. - -## 0.2.0 - 2020-10-10 - -### Changed - -- Reader takes a reference to the schema. - -## 0.1.0 - 2020-10-08 - -### Added - -Initial implementation of -- avrow -- avrow-cli (av) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 4ff57d6..0000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,75 +0,0 @@ - -## Code of Conduct - -### Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to making participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, gender identity and expression, level of experience, -nationality, personal appearance, race, religion, or sexual identity and -orientation. 
- -### Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or -advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -### Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -### Scope - -This Code of Conduct applies both within project spaces and in public spaces -when an individual is representing the project or its community. Examples of -representing a project or community include using an official project e-mail -address, posting via an official social media account, or acting as an appointed -representative at an online or offline event. Representation of a project may be -further defined and clarified by project maintainers. 
- -### Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at [INSERT EMAIL ADDRESS]. All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. - -### Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at [http://contributor-covenant.org/version/1/4][version] - -[homepage]: http://contributor-covenant.org -[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index e5225cb..0000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,48 +0,0 @@ - -# Contributing - -Some of the features of avrow are feature gated. -While making changes it's a good idea to build and -test with the `--all-features` flag. - -## Building the project - -``` -cargo build --all-features -``` - -## Running test cases - -``` -cargo test --all-features -``` - -## Generating and opening documentation locally - -``` -BROWSER=firefox cargo doc --no-deps --open -``` - -When contributing to this repository, please discuss the change you wish to make via issue, -email, or any other method with the owners of this repository before making a change. - -Please note we have a [code of conduct](./CODE_OF_CONDUCT.md); please follow it in all your interactions with the project. - -## Pull Request Process - -Following is a cursory guideline on how to make the process of making changes more efficient for the contributor and the maintainer. - -1. 
File an issue for the change you want to make. This way we can track the why of the change. - Get consensus from community for the change. -2. Clone the project and perform a fresh build. Create a branch with the naming "feature/issue-number". -3. Ensure that the PR only changes the parts of code which implement/solve the issue. This includes running - the linter (cargo fmt) and removing any extra spaces and any formatting changes that were accidentally made by - the code editor in use. -4. If your PR has changes that should also reflect in README.md, please update that as well. -5. Document non-obvious changes and the `why` of your changes if it's unclear. -6. If you are adding a public API, add the documentation as well. -7. Increase the version numbers in Cargo.toml files and the README.md to the new version that this - Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). -8. Update the CHANGELOG.md to reflect the change if applicable. - -More details: https://github.community/t/best-practices-for-pull-requests/10195 \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml deleted file mode 100644 index 74b4536..0000000 --- a/Cargo.toml +++ /dev/null @@ -1,71 +0,0 @@ -[package] -name = "avrow" -version = "0.2.1" -authors = ["creativcoder "] -edition = "2018" -repository = "https://github.com/creativcoder/avrow" -license = "MIT OR Apache-2.0" -description = "Avrow is a fast, type safe serde based data serialization library" -homepage = "https://github.com/creativcoder/avrow" -documentation = "https://docs.rs/avrow" -readme = "README.md" -keywords = ["avro", "avrow", "rust-avro", "serde-avro","encoding"] -categories = ["encoding", "compression", "command-line-utilities"] - -[dependencies] -serde = {version= "1", features=["derive"] } -serde_derive = "1" -serde_json = { version="1", features=["preserve_order"] } -rand = "0.4.2" -byteorder = "1" -integer-encoding = "2" -snap = { version = "0.2", optional = true } -flate2 = { 
version = "1", features = ["zlib"], default-features = false, optional = true } -crc = "1" -thiserror = "1.0" -indexmap = {version = "1", features = ["serde-1"]} -once_cell = "1.4.1" -zstdd = { version = "0.5.3", optional = true, package="zstd" } -bzip2 = { version = "0.4.1", optional = true } -xz2 = { version = "0.1", optional = true } -shatwo = { version = "0.9.1", optional = true, package="sha2" } -mdfive = { version = "0.7.0", optional = true, package="md5" } - -[dev-dependencies] -criterion = "0.2" -pretty_env_logger = "0.4" -fstrings = "0.2" -env_logger = "0.4" -anyhow = "1.0.32" - -[[bench]] -name = "primitives" -harness = false - -[[bench]] -name = "complex" -harness = false - -[[bench]] -name = "schema" -harness = false - -[features] -# compression codecs -snappy = ["snap"] -deflate = ["flate2"] -zstd = ["zstdd"] -bzip = ["bzip2"] -xz = ["xz2"] -# fingerprint codecs -sha2 = ["shatwo"] -md5 = ["mdfive"] - -codec = ["snappy", "deflate", "zstd", "bzip2", "xz"] -fingerprint = ["sha2", "md5"] -all = ["codec", "fingerprint"] - -[profile.release] -opt-level = 'z' -lto = true -codegen-units = 1 diff --git a/LICENSE-APACHE b/LICENSE-APACHE deleted file mode 100644 index 16fe87b..0000000 --- a/LICENSE-APACHE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- -Copyright [yyyy] [name of copyright owner] - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT deleted file mode 100644 index 31aa793..0000000 --- a/LICENSE-MIT +++ /dev/null @@ -1,23 +0,0 @@ -Permission is hereby granted, free of charge, to any -person obtaining a copy of this software and associated -documentation files (the "Software"), to deal in the -Software without restriction, including without -limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of -the Software, and to permit persons to whom the Software -is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice -shall be included in all copies or substantial portions -of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF -ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED -TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A -PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT -SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR -IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 1c7418e..0000000 --- a/README.md +++ /dev/null @@ -1,425 +0,0 @@ -
- avrow - -[![Actions Status](https://github.com/creativcoder/avrow/workflows/ci/badge.svg)](https://github.com/creativcoder/avrow/actions) -[![crates](https://img.shields.io/crates/v/avrow.svg)](https://crates.io/crates/avrow) -[![docs.rs](https://docs.rs/avrow/badge.svg)](https://docs.rs/avrow/) -[![license](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/creativcoder/avrow/blob/master/LICENSE-MIT) -[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/creativcoder/avrow/blob/master/LICENSE-APACHE) -[![Contributor Covenant](https://img.shields.io/badge/contributor%20covenant-v1.4%20adopted-ff69b4.svg)](CODE_OF_CONDUCT.md) - -
-
- - -### Avrow is a pure Rust implementation of the [Avro specification](https://avro.apache.org/docs/current/spec.html) with [Serde](https://github.com/serde-rs/serde) support. - - -
-
- -
- -### Table of Contents -- [Overview](#overview) -- [Features](#features) -- [Getting started](#getting-started) -- [Examples](#examples) - - [Writing avro data](#writing-avro-data) - - [Reading avro data](#reading-avro-data) - - [Writer builder](#writer-customization) -- [Supported Codecs](#supported-codecs) -- [Using the avrow-cli tool](#using-avrow-cli-tool) -- [Benchmarks](#benchmarks) -- [Todo](#todo) -- [Changelog](#changelog) -- [Contributions](#contributions) -- [Support](#support) -- [MSRV](#msrv) -- [License](#license) - -## Overview - -Avrow is a pure Rust implementation of the [Avro specification](https://avro.apache.org/docs/current/spec.html): a row based data serialization system. The Avro data serialization format finds its use quite a lot in big data streaming systems such as [Kafka](https://kafka.apache.org/) and [Spark](https://spark.apache.org/). -Within avro's context, an avro encoded file or byte stream is called a "data file". -To write data in avro encoded format, one needs a schema which is provided in json format. Here's an example of an avro schema represented in json: - -```json -{ - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] -} -``` -The above schema is of type record with fields and represents a linked list of 64-bit integers. In most implementations, this schema is then fed to a `Writer` instance along with a buffer to write encoded data to. One can then call one -of the `write` methods on the writer to write data. One distinguishing aspect of avro is that the schema for the encoded data is written on the header of the data file. This means that for reading data you don't need to provide a schema to a `Reader` instance. The spec also allows providing a reader schema to filter data when reading. 
- -The Avro specification provides two kinds of encoding: -* Binary encoding - Efficient and takes less space on disk. -* JSON encoding - When you want a readable version of avro encoded data. Also used for debugging purposes. - -This crate implements only the binary encoding as that's the format practically used for performance and storage reasons. - -## Features - -* Full support for recursive self-referential schemas with Serde serialization/deserialization. -* All compression codecs (`deflate`, `bzip2`, `snappy`, `xz`, `zstd`) supported as per spec. -* Simple and intuitive API - As the underlying structures in use are `Read` and `Write` types, avrow tries to mimic the same APIs as Rust's standard library APIs for minimal learning overhead. Writing avro values is simply calling `write` or `serialize` (with serde) and reading avro values is simply using iterators. -* Less bloat / Lightweight - Compile times in Rust are costly. Avrow tries to use minimal third-party crates. Compression codec and schema fingerprinting support are feature gated by default. To use them, compile with respective feature flags (e.g. `--features zstd`). -* Schema evolution - One can configure the avrow `Reader` with a reader schema and only read data relevant to their use case. -* Schemas in avrow support querying their canonical form and have fingerprinting (`rabin64`, `sha256`, `md5`) support. - -**Note**: This is not a complete spec implementation and remaining features being implemented are listed under the [Todo](#todo) section. 
- -## Getting started: - -Add avrow as a dependency to `Cargo.toml`: - -```toml -[dependencies] -avrow = "0.2.0" -``` - -## Examples: - -### Writing avro data - -```rust - -use anyhow::Error; -use avrow::{Schema, Writer}; -use std::str::FromStr; - -fn main() -> Result<(), Error> { - // Create schema from json - let schema = Schema::from_str(r##"{"type":"string"}"##)?; - // or from a path - let schema2 = Schema::from_path("./string_schema.avsc")?; - // Create an output stream - let stream = Vec::new(); - // Create a writer - let writer = Writer::new(&schema, stream.as_slice())?; - // Write your data! - let res = writer.write("Hey")?; - // or using serialize method for serde derived types. - let res = writer.serialize("there!")?; - - Ok(()) -} - -``` -For simple and native Rust types, avrow provides a `From` impl to convert to Avro value types. For compound or user defined types (structs or enums), one can use the `serialize` method which relies on serde. Alternatively, one can construct `avrow::Value` instances which is a more verbose way to write avro values and should be a last resort. 
- -### Reading avro data - -```rust -fn main() -> Result<(), Error> { - let schema = Schema::from_str(r##""null""##); - let data = vec![ - 79, 98, 106, 1, 4, 22, 97, 118, 114, 111, 46, 115, 99, 104, 101, - 109, 97, 32, 123, 34, 116, 121, 112, 101, 34, 58, 34, 98, 121, 116, - 101, 115, 34, 125, 20, 97, 118, 114, 111, 46, 99, 111, 100, 101, - 99, 14, 100, 101, 102, 108, 97, 116, 101, 0, 145, 85, 112, 15, 87, - 201, 208, 26, 183, 148, 48, 236, 212, 250, 38, 208, 2, 18, 227, 97, - 96, 100, 98, 102, 97, 5, 0, 145, 85, 112, 15, 87, 201, 208, 26, - 183, 148, 48, 236, 212, 250, 38, 208, - ]; - // Create a Reader - let reader = Reader::with_schema(v.as_slice(), &schema)?; - for i in reader { - dbg!(&i); - } - - Ok(()) -} - -``` - -### Self-referential recursive schema example - -```rust -use anyhow::Error; -use avrow::{from_value, Codec, Reader, Schema, Writer}; -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Serialize, Deserialize)] -struct LongList { - value: i64, - next: Option>, -} - -fn main() -> Result<(), Error> { - let schema = r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "##; - - let schema = Schema::from_str(schema)?; - let mut writer = Writer::with_codec(&schema, vec![], Codec::Null)?; - - let value = LongList { - value: 1i64, - next: Some(Box::new(LongList { - value: 2i64, - next: Some(Box::new(LongList { - value: 3i64, - next: Some(Box::new(LongList { - value: 4i64, - next: Some(Box::new(LongList { - value: 5i64, - next: None, - })), - })), - })), - })), - }; - - writer.serialize(value)?; - - // Calling into_inner performs flush internally. Alternatively, one can call flush explicitly. 
- let buf = writer.into_inner()?; - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema)?; - for i in reader { - let a: LongList = from_value(&i)?; - dbg!(a); - } - - Ok(()) -} - -``` - -### An example of writing a json object with a confirming schema. The json object maps to the `avrow::Record` type. - -```rust -use anyhow::Error; -use avrow::{from_value, Reader, Record, Schema, Writer}; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; - -#[derive(Debug, Serialize, Deserialize)] -struct Mentees { - id: i32, - username: String, -} - -#[derive(Debug, Serialize, Deserialize)] -struct RustMentors { - name: String, - github_handle: String, - active: bool, - mentees: Mentees, -} - -fn main() -> Result<(), Error> { - let schema = Schema::from_str( - r##" - { - "name": "rust_mentors", - "type": "record", - "fields": [ - { - "name": "name", - "type": "string" - }, - { - "name": "github_handle", - "type": "string" - }, - { - "name": "active", - "type": "boolean" - }, - { - "name":"mentees", - "type": { - "name":"mentees", - "type": "record", - "fields": [ - {"name":"id", "type": "int"}, - {"name":"username", "type": "string"} - ] - } - } - ] - } -"##, - )?; - - let json_data = serde_json::from_str( - r##" - { "name": "bob", - "github_handle":"ghbob", - "active": true, - "mentees":{"id":1, "username":"alice"} }"##, - )?; - let rec = Record::from_json(json_data, &schema)?; - let mut writer = crate::Writer::new(&schema, vec![])?; - writer.write(rec)?; - - let avro_data = writer.into_inner()?; - let reader = crate::Reader::new(avro_data.as_slice())?; - for value in reader { - let mentors: RustMentors = from_value(&value)?; - dbg!(mentors); - } - Ok(()) -} - -``` - -### Writer customization - -If you want to have more control over the parameters of `Writer`, consider using `WriterBuilder` as shown below: - -```rust - -use anyhow::Error; -use avrow::{Codec, Reader, Schema, WriterBuilder}; - -fn main() -> Result<(), Error> { - let schema = 
Schema::from_str(r##""null""##)?; - let v = vec![]; - let mut writer = WriterBuilder::new() - .set_codec(Codec::Null) - .set_schema(&schema) - .set_datafile(v) - // set any custom metadata in the header - .set_metadata("hello", "world") - // set after how many bytes, the writer should flush - .set_flush_interval(128_000) - .build() - .unwrap(); - writer.serialize(())?; - let v = writer.into_inner()?; - - let reader = Reader::with_schema(v.as_slice(), schema)?; - for i in reader { - dbg!(i?); - } - - Ok(()) -} -``` - -Refer to [examples](./examples) for more code examples. - -## Supported Codecs - -In order to facilitate efficient encoding, avro spec also defines compression codecs to use when serializing data. - -Avrow supports all compression codecs as per spec: - -- Null - The default is no codec. -- [Deflate](https://en.wikipedia.org/wiki/DEFLATE) -- [Snappy](https://github.com/google/snappy) -- [Zstd](https://facebook.github.io/zstd/) -- [Bzip2](https://www.sourceware.org/bzip2/) -- [Xz](https://linux.die.net/man/1/xz) - -These are feature-gated behind their respective flags. Check `Cargo.toml` `features` section for more details. - -## Using avrow-cli tool: - -Quite often you will need a quick way to examine avro file for debugging purposes. -For that, this repository also comes with the [`avrow-cli`](./avrow-cli) tool (av) -by which one can examine avro datafiles from the command line. - -See [avrow-cli](avrow-cli/) repository for more details. - -Installing avrow-cli: - -``` -cd avrow-cli -cargo install avrow-cli -``` - -Using avrow-cli (binary name is `av`): - -```bash -av read -d data.avro -``` - -The `read` subcommand will print all rows in `data.avro` to standard out in debug format. 
- -### Rust native types to Avro value mapping (via Serde) - -Primitives ---- - -| Rust native types (primitive types) | Avro (`Value`) | -| ----------------------------------- | -------------- | -| `(), Option::None` | `null` | -| `bool` | `boolean` | -| `i8, u8, i16, u16, i32, u32` | `int` | -| `i64, u64` | `long` | -| `f32` | `float` | -| `f64` | `double` | -| `&[u8], Vec` | `bytes` | -| `&str, String` | `string` | ---- -Complex - -| Rust native types (complex types) | Avro | -| ---------------------------------------------------- | -------- | -| `struct Foo {..}` | `record` | -| `enum Foo {A,B}` (variants cannot have data in them) | `enum` | -| `Vec where T: Into` | `array` | -| `HashMap where T: Into` | `map` | -| `T where T: Into` | `union` | -| `Vec` : Length equal to size defined in schema | `fixed` | - -
- -## Todo - -* [Logical types](https://avro.apache.org/docs/current/spec.html#Logical+Types) support. -* Sorted reads. -* Single object encoding. -* Schema Registry as a trait - would allow avrow to read from and write to remote schema registries. -* AsyncRead + AsyncWrite Reader and Writers. -* Avro protocol message and RPC support. -* Benchmarks and optimizations. - -## Changelog - -Please see the [CHANGELOG](CHANGELOG.md) for a release history. - -## Contributions - -All kinds of contributions are welcome. - -Head over to [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines. - -## Support - -Buy Me A Coffee - -[![ko-fi](https://www.ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/P5P71YZ0L) - -## MSRV - -Avrow works on stable Rust, starting 1.37+. -It does not use any nightly features. - -## License - -Dual licensed under either of Apache License, Version -2.0 or MIT license at your option. - -Unless you explicitly state otherwise, any contribution intentionally submitted -for inclusion in this crate by you, as defined in the Apache-2.0 license, shall -be dual licensed as above, without any additional terms or conditions. diff --git a/avrow-cli/Cargo.toml b/avrow-cli/Cargo.toml deleted file mode 100644 index 574639c..0000000 --- a/avrow-cli/Cargo.toml +++ /dev/null @@ -1,16 +0,0 @@ -[package] -name = "avrow-cli" -version = "0.1.0" -authors = ["creativcoder "] -edition = "2018" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -avrow = { path = "../../avrow", features=["all"] } -anyhow = "1.0.32" -structopt = "0.3.20" - -[[bin]] -name = "av" -path="src/main.rs" diff --git a/avrow-cli/README.md b/avrow-cli/README.md deleted file mode 100644 index 4c05ccc..0000000 --- a/avrow-cli/README.md +++ /dev/null @@ -1,34 +0,0 @@ - -## Avrow-cli - command line tool to examine avro files [WIP] - -Inspired from avro-tools.jar - -## Install - -``` -cargo install --path . 
-``` -This will install the binary as `av`. - -### Following subcommands are the supported as of now. - -``` -avrow-cli 0.1.0 -Command line tool for examining avro datafiles - -USAGE: - av - -FLAGS: - -h, --help Prints help information - -V, --version Prints version information - -SUBCOMMANDS: - bytes Dumps the avro datafile as bytes for debugging purposes - canonical Prints the canonical form of writer's schema encoded in the avro datafile. - fingerprint Prints fingerprint of the canonical form of writer's schema in the avro datafile. - help Prints this message or the help of the given subcommand(s) - metadata Get metadata information of the avro datafile - read Prints data in the avro datafile in debug format - schema Prints the writer's schema encoded in the avro datafile -``` diff --git a/avrow-cli/src/main.rs b/avrow-cli/src/main.rs deleted file mode 100644 index 8004346..0000000 --- a/avrow-cli/src/main.rs +++ /dev/null @@ -1,82 +0,0 @@ -//! avrow-cli is a command line tool to examine and analyze avro data files. -//! -//! Usage: `av -d read` -//! -//! The above command prints the data contained in the in a readable format. -//! - -mod subcommand; -mod utils; - -use std::path::PathBuf; -use structopt::StructOpt; -use subcommand::{bytes, canonical, fingerprint, metadata, read, schema}; -use utils::read_datafile; - -#[derive(StructOpt, Debug)] -#[structopt(about = "Command line tool for examining avro datafiles")] -enum AvrowCli { - #[structopt( - name = "metadata", - about = "Get metadata information of the avro datafile" - )] - Metadata { - #[structopt(short)] - datafile: PathBuf, - }, - #[structopt( - name = "schema", - about = "Prints the writer's schema encoded in the avro datafile" - )] - Schema { - #[structopt(short)] - datafile: PathBuf, - }, - #[structopt( - about = "Prints fingerprint of the canonical form of writer's schema in the avro datafile." 
- )] - Fingerprint { - #[structopt(short)] - datafile: PathBuf, - #[structopt(short)] - fingerprint: String, - }, - #[structopt( - about = "Prints the canonical form of writer's schema encoded in the avro datafile." - )] - Canonical { - #[structopt(short)] - datafile: PathBuf, - }, - #[structopt(about = "Prints data in the avro datafile in debug format")] - Read { - #[structopt(short)] - datafile: PathBuf, - }, - #[structopt( - name = "bytes", - about = "Dumps the avro datafile as bytes for debugging purposes" - )] - Bytes { - #[structopt(short)] - datafile: PathBuf, - }, -} - -fn main() -> anyhow::Result<()> { - use AvrowCli::*; - let opt = AvrowCli::from_args(); - match opt { - Metadata { datafile } => metadata(&datafile)?, - Schema { datafile } => schema(&datafile)?, - Canonical { datafile } => canonical(&datafile)?, - Read { datafile } => read(&datafile)?, - Bytes { datafile } => bytes(&datafile)?, - Fingerprint { - datafile, - fingerprint: fp, - } => fingerprint(&datafile, &fp)?, - } - - Ok(()) -} diff --git a/avrow-cli/src/subcommand.rs b/avrow-cli/src/subcommand.rs deleted file mode 100644 index 647b363..0000000 --- a/avrow-cli/src/subcommand.rs +++ /dev/null @@ -1,89 +0,0 @@ -use crate::read_datafile; -use anyhow::{anyhow, Context}; -use avrow::{Header, Reader}; -use std::io::Read; -use std::path::PathBuf; -use std::str; - -pub fn metadata(datafile: &PathBuf) -> Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let header = Header::from_reader(&mut avro_datafile)?; - for (k, v) in header.metadata() { - print!("{}\t", k); - println!( - "{}", - str::from_utf8(v).expect("Invalid UTF-8 in avro header") - ); - } - Ok(()) -} - -pub fn read(datafile: &PathBuf) -> Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let reader = Reader::new(&mut avro_datafile)?; - // TODO: remove irrelevant fields - for i in reader { - println!("{:?}", i?); - } - - Ok(()) -} - -pub fn bytes(datafile: &PathBuf) -> 
Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let mut v = vec![]; - - avro_datafile - .read_to_end(&mut v) - .with_context(|| "Failed to read datafile")?; - - println!("{:?}", v); - Ok(()) -} - -pub fn schema(datafile: &PathBuf) -> Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let header = Header::from_reader(&mut avro_datafile)?; - // TODO print human readable schema - println!("{}", header.schema()); - Ok(()) -} - -pub fn fingerprint(datafile: &PathBuf, fingerprint: &str) -> Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let header = Header::from_reader(&mut avro_datafile)?; - match fingerprint.as_ref() { - "rabin64" => { - println!("0x{:x}", header.schema().canonical_form().rabin64()); - } - "sha256" => { - let mut fingerprint_str = String::new(); - let sha256 = header.schema().canonical_form().sha256(); - for i in sha256 { - let a = format!("{:x}", i); - fingerprint_str.push_str(&a); - } - - println!("{}", fingerprint_str); - } - "md5" => { - let mut fingerprint_str = String::new(); - let md5 = header.schema().canonical_form().md5(); - for i in md5 { - let a = format!("{:x}", i); - fingerprint_str.push_str(&a); - } - - println!("{}", fingerprint_str); - } - other => return Err(anyhow!("invalid or unsupported fingerprint: {}", other)), - } - Ok(()) -} - -pub fn canonical(datafile: &PathBuf) -> Result<(), anyhow::Error> { - let mut avro_datafile = read_datafile(datafile)?; - let header = Header::from_reader(&mut avro_datafile)?; - println!("{}", header.schema().canonical_form()); - Ok(()) -} diff --git a/avrow-cli/src/utils.rs b/avrow-cli/src/utils.rs deleted file mode 100644 index 2b63045..0000000 --- a/avrow-cli/src/utils.rs +++ /dev/null @@ -1,11 +0,0 @@ -use anyhow::Context; -use anyhow::Result; -use std::path::Path; - -// Open an avro datafile for reading avro data -pub(crate) fn read_datafile>(path: P) -> Result { - std::fs::OpenOptions::new() - 
.read(true) - .open(path) - .with_context(|| "Could not read datafile") -} diff --git a/assets/avrow_logo.png b/avrow_logo.png similarity index 100% rename from assets/avrow_logo.png rename to avrow_logo.png diff --git a/benches/complex.rs b/benches/complex.rs deleted file mode 100644 index 3f8794a..0000000 --- a/benches/complex.rs +++ /dev/null @@ -1,150 +0,0 @@ -extern crate avrow; -extern crate serde; -#[macro_use] -extern crate serde_derive; - -#[macro_use] -extern crate criterion; - -use avrow::Codec; -use avrow::Schema; -use avrow::Writer; -use criterion::Criterion; -use std::str::FromStr; - -#[derive(Debug, Serialize, Deserialize)] -struct LongList { - value: i64, - next: Option>, -} - -fn simple_record(c: &mut Criterion) { - c.bench_function("simple_record", |b| { - let schema = Schema::from_str( - r##"{ - "namespace": "atherenergy.vcu_cloud_connect", - "type": "record", - "name": "can_raw", - "fields" : [ - {"name": "one", "type": "int"}, - {"name": "two", "type": "long"}, - {"name": "three", "type": "long"}, - {"name": "four", "type": "int"}, - {"name": "five", "type": "long"} - ] - }"##, - ) - .unwrap(); - let v = vec![]; - let mut writer = Writer::with_codec(&schema, v, Codec::Null).unwrap(); - b.iter(|| { - for _ in 0..1000 { - let data = Data { - one: 34, - two: 334, - three: 45765, - four: 45643, - five: 834, - }; - - writer.serialize(data).unwrap(); - } - - // batch and write data - writer.flush().unwrap(); - }); - }); -} - -#[derive(Serialize, Deserialize)] -struct Data { - one: u32, - two: u64, - three: u64, - four: u32, - five: u64, -} - -fn array_record(c: &mut Criterion) { - c.bench_function("Array of records", |b| { - let schema = Schema::from_str( - r##"{"type": "array", "items": { - "namespace": "atherenergy.vcu_cloud_connect", - "type": "record", - "name": "can_raw", - "fields" : [ - {"name": "one", "type": "int"}, - {"name": "two", "type": "long"}, - {"name": "three", "type": "long"}, - {"name": "four", "type": "int"}, - {"name": "five", 
"type": "long"} - ] - }}"##, - ) - .unwrap(); - let mut v = vec![]; - let mut writer = Writer::with_codec(&schema, &mut v, Codec::Null).unwrap(); - b.iter(|| { - let mut can_array = vec![]; - for _ in 0..1000 { - let data = Data { - one: 34, - two: 334, - three: 45765, - four: 45643, - five: 834, - }; - - can_array.push(data); - } - - // batch and write data - writer.serialize(can_array).unwrap(); - writer.flush().unwrap(); - }); - }); -} - -fn nested_recursive_record(c: &mut Criterion) { - c.bench_function("recursive_nested_record", |b| { - let schema = r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = Writer::with_codec(&schema, vec![], Codec::Null).unwrap(); - - b.iter(|| { - for _ in 0..1000 { - let value = LongList { - value: 1i64, - next: Some(Box::new(LongList { - value: 2, - next: Some(Box::new(LongList { - value: 3, - next: None, - })), - })), - }; - writer.serialize(value).unwrap(); - } - }); - writer.flush().unwrap(); - }); -} - -criterion_group!( - benches, - nested_recursive_record, - array_record, - simple_record -); -criterion_main!(benches); diff --git a/benches/primitives.rs b/benches/primitives.rs deleted file mode 100644 index 9651535..0000000 --- a/benches/primitives.rs +++ /dev/null @@ -1,149 +0,0 @@ -extern crate avrow; - -#[macro_use] -extern crate criterion; - -use criterion::Criterion; - -use avrow::from_value; -use avrow::Reader; -use avrow::Schema; -use avrow::Writer; -use std::str::FromStr; - -fn criterion_benchmark(c: &mut Criterion) { - // Write benchmarks - c.bench_function("write_null", |b| { - let schema = Schema::from_str(r##"{"type": "null" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write(()).unwrap(); - } - }); 
- - writer.flush().unwrap(); - }); - - c.bench_function("write_boolean", |b| { - let schema = Schema::from_str(r##"{"type": "boolean" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for i in 0..100_000 { - writer.write(i % 2 == 0).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_int", |b| { - let schema = Schema::from_str(r##"{"type": "int" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write(45).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_long", |b| { - let schema = Schema::from_str(r##"{"type": "long" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write(45i64).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_float", |b| { - let schema = Schema::from_str(r##"{"type": "float" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write(45.0f32).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_double", |b| { - let schema = Schema::from_str(r##"{"type": "double" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write(45.0f64).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_bytes", |b| { - let schema = Schema::from_str(r##"{"type": "bytes" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - let v = vec![0u8, 1, 2, 3]; - writer.write(v).unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - c.bench_function("write_string", |b| { - let schema = 
Schema::from_str(r##"{"type": "string" }"##).unwrap(); - let mut out = vec![]; - let mut writer = Writer::new(&schema, &mut out).unwrap(); - - b.iter(|| { - for _ in 0..100_000 { - writer.write("hello").unwrap(); - } - }); - - writer.flush().unwrap(); - }); - - // Read benchmarks - c.bench_function("avro_read_bytes_from_vec", |b| { - let avro_data = vec![ - 79, 98, 106, 1, 4, 22, 97, 118, 114, 111, 46, 115, 99, 104, 101, 109, 97, 32, 123, 34, - 116, 121, 112, 101, 34, 58, 34, 98, 121, 116, 101, 115, 34, 125, 20, 97, 118, 114, 111, - 46, 99, 111, 100, 101, 99, 8, 110, 117, 108, 108, 0, 149, 158, 112, 231, 150, 73, 245, - 11, 130, 6, 13, 141, 239, 19, 146, 71, 2, 14, 12, 0, 1, 2, 3, 4, 5, 149, 158, 112, 231, - 150, 73, 245, 11, 130, 6, 13, 141, 239, 19, 146, 71, - ]; - - b.iter(|| { - let reader = Reader::new(avro_data.as_slice()).unwrap(); - for i in reader { - let _: Vec = from_value(&i).unwrap(); - } - }); - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/benches/schema.rs b/benches/schema.rs deleted file mode 100644 index 61b3355..0000000 --- a/benches/schema.rs +++ /dev/null @@ -1,61 +0,0 @@ -#[macro_use] -extern crate criterion; -extern crate avrow; - -use criterion::criterion_group; -use criterion::Criterion; -use std::str::FromStr; - -use avrow::Schema; - -fn parse_enum_schema() { - let _ = Schema::from_str( - r##"{ "type": "enum", - "name": "Suit", - "symbols" : ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"] - }"##, - ) - .unwrap(); -} - -fn parse_string_schema() { - let _ = Schema::from_str(r##""string""##).unwrap(); -} - -fn parse_record_schema(c: &mut Criterion) { - c.bench_function("parse_record_schema", |b| { - b.iter(|| { - let _ = Schema::from_str( - r##"{ - "namespace": "sensor_data", - "type": "record", - "name": "can", - "fields" : [ - {"name": "can_id", "type": "int"}, - {"name": "data", "type": "long"}, - {"name": "timestamp", "type": "double"}, - {"name": "seq_num", "type": "int"}, - {"name": 
"global_seq", "type": "long"} - ] - }"##, - ) - .unwrap(); - }); - }); -} - -fn bench_string_schema(c: &mut Criterion) { - c.bench_function("parse string schema", |b| b.iter(parse_string_schema)); -} - -fn bench_enum_schema(c: &mut Criterion) { - c.bench_function("parse enum schema", |b| b.iter(parse_enum_schema)); -} - -criterion_group!( - benches, - bench_string_schema, - bench_enum_schema, - parse_record_schema -); -criterion_main!(benches); diff --git a/examples/canonical.rs b/examples/canonical.rs deleted file mode 100644 index 9b7293c..0000000 --- a/examples/canonical.rs +++ /dev/null @@ -1,24 +0,0 @@ -use anyhow::Error; -use avrow::Schema; -use std::str::FromStr; - -fn main() -> Result<(), Error> { - let schema = Schema::from_str( - r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"] - }] - } - "##, - ) - .unwrap(); - println!("{}", schema.canonical_form()); - // get the rabin fingerprint of the canonical form. 
- dbg!(schema.canonical_form().rabin64()); - Ok(()) -} diff --git a/examples/from_json_to_struct.rs b/examples/from_json_to_struct.rs deleted file mode 100644 index 0938a6d..0000000 --- a/examples/from_json_to_struct.rs +++ /dev/null @@ -1,72 +0,0 @@ -use anyhow::Error; -use avrow::{from_value, Reader, Record, Schema, Writer}; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; -#[derive(Debug, Serialize, Deserialize)] -struct Mentees { - id: i32, - username: String, -} - -#[derive(Debug, Serialize, Deserialize)] -struct RustMentors { - name: String, - github_handle: String, - active: bool, - mentees: Mentees, -} - -fn main() -> Result<(), Error> { - let schema = Schema::from_str( - r##" - { - "name": "rust_mentors", - "type": "record", - "fields": [ - { - "name": "name", - "type": "string" - }, - { - "name": "github_handle", - "type": "string" - }, - { - "name": "active", - "type": "boolean" - }, - { - "name":"mentees", - "type": { - "name":"mentees", - "type": "record", - "fields": [ - {"name":"id", "type": "int"}, - {"name":"username", "type": "string"} - ] - } - } - ] - } -"##, - )?; - - let json_data = serde_json::from_str( - r##" - { "name": "bob", - "github_handle":"ghbob", - "active": true, - "mentees":{"id":1, "username":"alice"} }"##, - )?; - let rec = Record::from_json(json_data, &schema)?; - let mut writer = crate::Writer::new(&schema, vec![])?; - writer.write(rec)?; - - let avro_data = writer.into_inner()?; - let reader = Reader::new(avro_data.as_slice())?; - for value in reader { - let mentors: RustMentors = from_value(&value)?; - dbg!(mentors); - } - Ok(()) -} diff --git a/examples/hello_world.rs b/examples/hello_world.rs deleted file mode 100644 index 4b2f5fd..0000000 --- a/examples/hello_world.rs +++ /dev/null @@ -1,41 +0,0 @@ -// A hello world example of reading and writing avro data files - -use anyhow::Error; -use avrow::from_value; -use avrow::Reader; -use avrow::Schema; -use avrow::Writer; -use std::str::FromStr; - -use 
std::io::Cursor; - -fn main() -> Result<(), Error> { - // Writing data - - // Create a schema - let schema = Schema::from_str(r##""null""##)?; - // Create writer using schema and provide a buffer (implements Read) to write to - let mut writer = Writer::new(&schema, vec![])?; - // Write the data using write and creating a Value manually. - writer.write(())?; - // or the more convenient and intuitive serialize method that takes native Rust types. - writer.serialize(())?; - // retrieve the underlying buffer using the buffer method. - // TODO buffer is not intuive when using a file. into_inner is much better here. - let buf = writer.into_inner()?; - - // Reading data - - // Create Reader by providing a Read wrapped version of `buf` - let reader = Reader::new(Cursor::new(buf))?; - // Use iterator for reading data in an idiomatic manner. - for i in reader { - // reading values can fail due to decoding errors, so the return value of iterator is a Option> - // it allows one to examine the failure reason on the underlying avro reader. - dbg!(&i); - // This value can be converted to a native Rust type using `from_value` method that uses serde underneath. 
- let _val: () = from_value(&i)?; - } - - Ok(()) -} diff --git a/examples/recursive_record.rs b/examples/recursive_record.rs deleted file mode 100644 index c1b1274..0000000 --- a/examples/recursive_record.rs +++ /dev/null @@ -1,56 +0,0 @@ -use anyhow::Error; -use avrow::{from_value, Codec, Reader, Schema, Writer}; -use serde::{Deserialize, Serialize}; -use std::str::FromStr; - -#[derive(Debug, Serialize, Deserialize)] -struct LongList { - value: i64, - next: Option>, -} - -fn main() -> Result<(), Error> { - let schema = r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "##; - - let schema = Schema::from_str(schema)?; - let mut writer = Writer::with_codec(&schema, vec![], Codec::Null)?; - - let value = LongList { - value: 1i64, - next: Some(Box::new(LongList { - value: 2i64, - next: Some(Box::new(LongList { - value: 3i64, - next: Some(Box::new(LongList { - value: 4i64, - next: Some(Box::new(LongList { - value: 5i64, - next: None, - })), - })), - })), - })), - }; - writer.serialize(value)?; - - let buf = writer.into_inner()?; - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema)?; - for i in reader { - let a: LongList = from_value(&i)?; - dbg!(a); - } - - Ok(()) -} diff --git a/examples/writer_builder.rs b/examples/writer_builder.rs deleted file mode 100644 index 2959229..0000000 --- a/examples/writer_builder.rs +++ /dev/null @@ -1,23 +0,0 @@ -use anyhow::Error; -use avrow::{Codec, Reader, Schema, WriterBuilder}; -use std::str::FromStr; - -fn main() -> Result<(), Error> { - let schema = Schema::from_str(r##""null""##)?; - let v = vec![]; - let mut writer = WriterBuilder::new() - .set_codec(Codec::Null) - .set_schema(&schema) - .set_datafile(v) - .set_flush_interval(128_000) - .build()?; - writer.serialize(())?; - let v = writer.into_inner()?; - - let reader = Reader::with_schema(v.as_slice(), &schema)?; - for 
i in reader { - dbg!(i?); - } - - Ok(()) -} diff --git a/index.html b/index.html new file mode 100644 index 0000000..0d4e57f --- /dev/null +++ b/index.html @@ -0,0 +1,78 @@ + + + + + + + Avrow + + + + + + +
+
+
+ hero +
+
+

Avrow is a pure + Rust implementation + of the avro + specification with Serde support. +

+ + + + +
+ +
+
+ + + diff --git a/rustfmt.toml b/rustfmt.toml deleted file mode 100644 index 27eb93b..0000000 --- a/rustfmt.toml +++ /dev/null @@ -1,2 +0,0 @@ -edition = "2018" -reorder_imports = true \ No newline at end of file diff --git a/src/codec.rs b/src/codec.rs deleted file mode 100644 index 8b6c61d..0000000 --- a/src/codec.rs +++ /dev/null @@ -1,273 +0,0 @@ -use crate::error::AvrowErr; -use crate::util::{encode_long, encode_raw_bytes}; - -use std::io::Write; - -// Given a slice of bytes, generates a CRC for it -#[cfg(feature = "snappy")] -pub fn get_crc_uncompressed(pre_comp_buf: &[u8]) -> Result, AvrowErr> { - use byteorder::{BigEndian, WriteBytesExt}; - use crc::crc32; - - let crc_checksum = crc32::checksum_ieee(pre_comp_buf); - let mut checksum_bytes = Vec::with_capacity(1); - let _ = checksum_bytes - .write_u32::(crc_checksum) - .map_err(|_| { - let err: AvrowErr = AvrowErr::CRCGenFailed; - err - })?; - Ok(checksum_bytes) -} - -/// Given a uncompressed slice of bytes, returns a compresed Vector of bytes using the snappy codec -#[cfg(feature = "snappy")] -pub(crate) fn compress_snappy(uncompressed_buffer: &[u8]) -> Result, AvrowErr> { - let mut encoder = snap::Encoder::new(); - encoder - .compress_vec(uncompressed_buffer) - .map_err(|e| AvrowErr::DecodeFailed(e.into())) -} - -#[cfg(feature = "deflate")] -pub fn compress_deflate(uncompressed_buffer: &[u8]) -> Result, AvrowErr> { - use flate2::{write::DeflateEncoder, Compression}; - - let mut encoder = DeflateEncoder::new(Vec::new(), Compression::default()); - encoder - .write(uncompressed_buffer) - .map_err(AvrowErr::EncodeFailed)?; - encoder.finish().map_err(AvrowErr::EncodeFailed) -} - -#[cfg(feature = "zstd")] -pub(crate) fn zstd_compress(level: i32, uncompressed_buffer: &[u8]) -> Result, AvrowErr> { - let comp = zstdd::encode_all(std::io::Cursor::new(uncompressed_buffer), level) - .map_err(AvrowErr::EncodeFailed)?; - Ok(comp) -} - -#[cfg(feature = "deflate")] -pub fn decompress_deflate( - compressed_buffer: &[u8], 
- uncompressed: &mut Vec, -) -> Result<(), AvrowErr> { - use flate2::bufread::DeflateDecoder; - use std::io::Read; - - let mut decoder = DeflateDecoder::new(compressed_buffer); - uncompressed.clear(); - decoder - .read_to_end(uncompressed) - .map_err(AvrowErr::DecodeFailed)?; - Ok(()) -} - -#[cfg(feature = "snappy")] -pub(crate) fn decompress_snappy( - compressed_buffer: &[u8], - uncompressed: &mut Vec, -) -> Result<(), AvrowErr> { - use byteorder::ByteOrder; - - let data_minus_cksum = &compressed_buffer[..compressed_buffer.len() - 4]; - let decompressed_size = - snap::decompress_len(data_minus_cksum).map_err(|e| AvrowErr::DecodeFailed(e.into()))?; - uncompressed.resize(decompressed_size, 0); - snap::Decoder::new() - .decompress(data_minus_cksum, &mut uncompressed[..]) - .map_err(|e| AvrowErr::DecodeFailed(e.into()))?; - - let expected = - byteorder::BigEndian::read_u32(&compressed_buffer[compressed_buffer.len() - 4..]); - let found = crc::crc32::checksum_ieee(&uncompressed); - if expected != found { - return Err(AvrowErr::CRCMismatch { found, expected }); - } - Ok(()) -} - -#[cfg(feature = "zstd")] -pub(crate) fn decompress_zstd( - compressed_buffer: &[u8], - uncompressed: &mut Vec, -) -> Result<(), AvrowErr> { - let mut decoder = zstdd::Decoder::new(compressed_buffer).map_err(AvrowErr::DecodeFailed)?; - std::io::copy(&mut decoder, uncompressed).map_err(AvrowErr::DecodeFailed)?; - Ok(()) -} - -#[cfg(feature = "bzip2")] -pub(crate) fn decompress_bzip2( - compressed_buffer: &[u8], - uncompressed: &mut Vec, -) -> Result<(), AvrowErr> { - use bzip2::read::BzDecoder; - let decompressor = BzDecoder::new(compressed_buffer); - let mut buf = decompressor.into_inner(); - std::io::copy(&mut buf, uncompressed).map_err(AvrowErr::DecodeFailed)?; - Ok(()) -} - -#[cfg(feature = "xz")] -pub(crate) fn decompress_xz( - compressed_buffer: &[u8], - uncompressed: &mut Vec, -) -> Result<(), AvrowErr> { - use xz2::read::XzDecoder; - let decompressor = XzDecoder::new(compressed_buffer); - 
let mut buf = decompressor.into_inner(); - std::io::copy(&mut buf, uncompressed).map_err(AvrowErr::DecodeFailed)?; - Ok(()) -} -/// Defines codecs one can use when writing avro data. -#[derive(Debug, PartialEq, Clone, Copy)] -pub enum Codec { - /// The Null codec. When no codec is specified at the time of Writer creation, null is the default. - Null, - #[cfg(feature = "deflate")] - /// The Deflate codec.
Uses https://docs.rs/flate2 as the underlying implementation. - Deflate, - #[cfg(feature = "snappy")] - /// The Snappy codec.
Uses https://docs.rs/snap as the underlying implementation. - Snappy, - #[cfg(feature = "zstd")] - /// The Zstd codec.
Uses https://docs.rs/zstd as the underlying implementation. - Zstd, - #[cfg(feature = "bzip2")] - /// The Bzip2 codec.
Uses https://docs.rs/bzip2 as the underlying implementation. - Bzip2, - #[cfg(feature = "xz")] - /// The Xz codec.
Uses https://docs.rs/crate/xz2 as the underlying implementation. - Xz, -} - -impl AsRef for Codec { - fn as_ref(&self) -> &str { - match self { - Codec::Null => "null", - #[cfg(feature = "deflate")] - Codec::Deflate => "deflate", - #[cfg(feature = "snappy")] - Codec::Snappy => "snappy", - #[cfg(feature = "zstd")] - Codec::Zstd => "zstd", - #[cfg(feature = "bzip2")] - Codec::Bzip2 => "bzip2", - #[cfg(feature = "xz")] - Codec::Xz => "xz", - } - } -} - -// TODO allow all of these to be configurable for setting compression ratio/level -impl Codec { - pub(crate) fn encode( - &self, - block_stream: &mut [u8], - out_stream: &mut W, - ) -> Result<(), AvrowErr> { - match self { - Codec::Null => { - // encode size of data in block - encode_long(block_stream.len() as i64, out_stream)?; - // encode actual data bytes - encode_raw_bytes(&block_stream, out_stream)?; - } - #[cfg(feature = "snappy")] - Codec::Snappy => { - let checksum_bytes = get_crc_uncompressed(&block_stream)?; - let compressed_data = compress_snappy(&block_stream)?; - encode_long( - compressed_data.len() as i64 + crate::config::CRC_CHECKSUM_LEN as i64, - out_stream, - )?; - - out_stream - .write(&*compressed_data) - .map_err(AvrowErr::EncodeFailed)?; - out_stream - .write(&*checksum_bytes) - .map_err(AvrowErr::EncodeFailed)?; - } - #[cfg(feature = "deflate")] - Codec::Deflate => { - let compressed_data = compress_deflate(block_stream)?; - encode_long(compressed_data.len() as i64, out_stream)?; - encode_raw_bytes(&*compressed_data, out_stream)?; - } - #[cfg(feature = "zstd")] - Codec::Zstd => { - let compressed_data = zstd_compress(0, block_stream)?; - encode_long(compressed_data.len() as i64, out_stream)?; - encode_raw_bytes(&*compressed_data, out_stream)?; - } - #[cfg(feature = "bzip2")] - Codec::Bzip2 => { - use bzip2::read::BzEncoder; - use bzip2::Compression; - use std::io::Cursor; - let compressor = BzEncoder::new(Cursor::new(block_stream), Compression::new(5)); - let vec = 
compressor.into_inner().into_inner(); - - encode_long(vec.len() as i64, out_stream)?; - encode_raw_bytes(&*vec, out_stream)?; - } - #[cfg(feature = "xz")] - Codec::Xz => { - use std::io::Cursor; - use xz2::read::XzEncoder; - let compressor = XzEncoder::new(Cursor::new(block_stream), 6); - let vec = compressor.into_inner().into_inner(); - - encode_long(vec.len() as i64, out_stream)?; - encode_raw_bytes(&*vec, out_stream)?; - } - } - Ok(()) - } - - pub(crate) fn decode( - &self, - compressed: Vec, - uncompressed: &mut std::io::Cursor>, - ) -> Result<(), AvrowErr> { - match self { - Codec::Null => { - *uncompressed = std::io::Cursor::new(compressed); - Ok(()) - } - #[cfg(feature = "snappy")] - Codec::Snappy => decompress_snappy(&compressed, uncompressed.get_mut()), - #[cfg(feature = "deflate")] - Codec::Deflate => decompress_deflate(&compressed, uncompressed.get_mut()), - #[cfg(feature = "zstd")] - Codec::Zstd => decompress_zstd(&compressed, uncompressed.get_mut()), - #[cfg(feature = "bzip2")] - Codec::Bzip2 => decompress_bzip2(&compressed, uncompressed.get_mut()), - #[cfg(feature = "xz")] - Codec::Xz => decompress_xz(&compressed, uncompressed.get_mut()), - } - } -} - -impl std::convert::TryFrom<&str> for Codec { - type Error = AvrowErr; - - fn try_from(value: &str) -> Result { - match value { - "null" => Ok(Codec::Null), - #[cfg(feature = "snappy")] - "snappy" => Ok(Codec::Snappy), - #[cfg(feature = "deflate")] - "deflate" => Ok(Codec::Deflate), - #[cfg(feature = "zstd")] - "zstd" => Ok(Codec::Zstd), - #[cfg(feature = "bzip2")] - "bzip2" => Ok(Codec::Bzip2), - #[cfg(feature = "xz")] - "xz" => Ok(Codec::Xz), - o => Err(AvrowErr::UnsupportedCodec(o.to_string())), - } - } -} diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index b60a74c..0000000 --- a/src/config.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! This module contains constants and configuration parameters for configuring avro writers and readers. 
- -/// Synchronization marker bytes length, defaults to 16 bytes. -pub const SYNC_MARKER_SIZE: usize = 16; -/// The magic header for recognizing a file as an avro data file. -pub const MAGIC_BYTES: &[u8] = b"Obj\x01"; -/// Checksum length for snappy compressed data. -#[cfg(feature = "snappy")] -pub const CRC_CHECKSUM_LEN: usize = 4; -/// Minimum flush interval that a block can have. -pub const BLOCK_SIZE: usize = 4096; -/// This value defines the threshold post which the scratch buffer is -/// is flushed/synced to the main buffer. Suggested values are between 2K (bytes) and 2M -// TODO make this configurable -pub const DEFAULT_FLUSH_INTERVAL: usize = 16 * BLOCK_SIZE; diff --git a/src/error.rs b/src/error.rs deleted file mode 100644 index f264f32..0000000 --- a/src/error.rs +++ /dev/null @@ -1,184 +0,0 @@ -#![allow(missing_docs)] - -use serde::{de, ser}; -use std::fmt::Debug; -use std::fmt::Display; -use std::io::{Error, ErrorKind}; - -#[inline(always)] -pub(crate) fn io_err(msg: &str) -> Error { - Error::new(ErrorKind::Other, msg) -} - -// Required impls for Serde -impl ser::Error for AvrowErr { - fn custom(msg: T) -> Self { - Self::Message(msg.to_string()) - } -} - -impl de::Error for AvrowErr { - fn custom(msg: T) -> Self { - Self::Message(msg.to_string()) - } -} - -pub type AvrowResult = Result; - -/// Errors returned from avrow -#[derive(thiserror::Error, Debug)] -pub enum AvrowErr { - // Encode errors - #[error("Write failed")] - EncodeFailed(#[source] std::io::Error), - #[error("Encoding failed. 
Value does not match schema")] - SchemaDataMismatch, - #[error("Expected magic header: `Obj\n`")] - InvalidDataFile, - #[error("Sync marker does not match as expected")] - SyncMarkerMismatch, - #[error("Named schema not found in union")] - SchemaNotFoundInUnion, - #[error("Invalid field value: {0}")] - InvalidFieldValue(String), - #[error("Writer seek failed, not a valid avro data file")] - WriterSeekFailed, - #[error("Unions must not contain immediate union values")] - NoImmediateUnion, - #[error("Failed building the Writer")] - WriterBuildFailed, - #[error("Json must be an object for record")] - ExpectedJsonObject, - - // Decode errors - #[error("Read failed")] - DecodeFailed(#[source] std::io::Error), - #[error("failed reading `avro.schema` metadata from header")] - HeaderDecodeFailed, - #[error("Unsupported codec {0}, did you enable the feature?")] - UnsupportedCodec(String), - #[error("Named schema was not found in schema registry")] - NamedSchemaNotFound, - #[error("Schema resolution failed. reader's schema {0} != writer's schema {1}")] - SchemaResolutionFailed(String, String), - #[error("Index read for enum is out of range as per schema. 
got: {0} symbols: {1}")] - InvalidEnumSymbolIdx(usize, String), - #[error("Field not found in record")] - FieldNotFound, - #[error("Writer schema not found in reader's schema")] - WriterNotInReader, - #[error("Reader's union schema does not match with writer's union schema")] - UnionSchemaMismatch, - #[error("Map's value schema do not match")] - MapSchemaMismatch, - #[error("Fixed schema names do not match")] - FixedSchemaNameMismatch, - #[error("Could not find symbol at index {idx} in reader schema")] - EnumSymbolNotFound { idx: usize }, - #[error("Reader's enum name does not match writer's enum name")] - EnumNameMismatch, - #[error("Readers' record name does not match writer's record name")] - RecordNameMismatch, - #[error("Array items schema does not match")] - ArrayItemsMismatch, - #[error("Snappy decoder failed to get length of decompressed buffer")] - SnappyDecompressLenFailed, - #[error("End of file reached")] - Eof, - - // Schema parse errors - #[error("Failed to parse avro schema")] - SchemaParseErr(#[source] std::io::Error), - #[error("Unknown schema, expecting a required `type` field in schema")] - SchemaParseFailed, - #[error("Expecting fields key as a json array, found: {0}")] - SchemaFieldParseErr(String), - #[error("Expected: {0}, found: {1}")] - SchemaDataValidationFailed(String, String), - #[error("Schema has a field not found in the value")] - RecordFieldMissing, - #[error("Record schema does not a have a required field named `name`")] - RecordNameNotFound, - #[error("Record schema does not a have a required field named `type`")] - RecordTypeNotFound, - #[error("Expected record field to be a json array")] - ExpectedFieldsJsonArray, - #[error("Record's field json schema must be an object")] - InvalidRecordFieldType, - #[error("{0}")] - ParseFieldOrderErr(String), - #[error("Could not parse name from json value")] - NameParseFailed, - #[error("Parsing canonical form failed")] - ParsingCanonicalForm, - #[error("Duplicate definition of named schema")] 
- DuplicateSchema, - #[error("Duplicate field name in record schema")] - DuplicateField, - #[error("Invalid default value for union. Must be the first entry from union definition")] - FailedDefaultUnion, - #[error("Invalid default value for given schema")] - DefaultValueParse, - #[error("Unknown field ordering value.")] - UnknownFieldOrdering, - #[error("Field ordering value must be a string")] - InvalidFieldOrdering, - #[error("Failed to parse symbol from enum's symbols field")] - EnumSymbolParseErr, - #[error("Enum schema must contain required `symbols` field")] - EnumSymbolsMissing, - #[error("Enum value symbol not present in enum schema `symbols` field")] - EnumSymbolNotPresent, - #[error("Fixed schema `size` field must be a number")] - FixedSizeNotNumber, - #[error("Fixed schema `size` field missing")] - FixedSizeNotFound, - #[error("Unions cannot have multiple schemas of same type or immediate unions")] - DuplicateSchemaInUnion, - #[error("Expected the avro schema to be as one of json string, object or an array")] - UnknownSchema, - #[error("Expected record field to be a json object, found {0}")] - InvalidSchema(String), - #[error("Invalid type for {0}")] - InvalidType(String), - #[error("Enum schema parsing failed, found: {0}")] - EnumParseErr(String), - #[error("Primitve schema must be a string")] - InvalidPrimitiveSchema, - - // Validation errors - #[error("Mismatch in fixed bytes length: {found}, {expected}")] - FixedValueLenMismatch { found: usize, expected: usize }, - #[error("namespaces must either be empty or follow the grammer [()*")] - InvalidNamespace, - #[error("Field name must be [A-Za-z_] and subsequently contain only [A-Za-z0-9_]")] - InvalidName, - #[error("Array value is empty")] - EmptyArray, - #[error("Map value is empty")] - EmptyMap, - #[error("Crc generation failed")] - CRCGenFailed, - #[error("Snappy Crc mismatch")] - CRCMismatch { found: u32, expected: u32 }, - #[error("Named schema was not found for given value")] - 
NamedSchemaNotFoundForValue, - #[error("Value schema not found in union")] - NotFoundInUnion, - - // Serde specific errors - #[error("Serde error: {0}")] - Message(String), - #[error("Syntax error occured")] - Syntax, - #[error("Expected a string value")] - ExpectedString, - #[error("Unsupported type")] - Unsupported, - #[error("Unexpected avro value: {value}")] - UnexpectedAvroValue { value: String }, - - // Value errors - #[error("Expected value not found in variant instance")] - ExpectedVariantNotFound, -} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index c00d28a..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,84 +0,0 @@ -//! Avrow is a pure Rust implementation of the [Apache Avro specification](https://avro.apache.org/docs/current/spec.html). -//! -//! Please refer to the [README](https://github.com/creativcoder/avrow/blob/main/README.md) for an overview. -//! For more details on the spec, head over to the [FAQ](https://cwiki.apache.org/confluence/display/AVRO/FAQ). -//! -//! ## Using the library -//! -//! Add avrow to your `Cargo.toml`: -//!```toml -//! [dependencies] -//! avrow = "0.2.1" -//!``` -//! ## A hello world example of reading and writing avro data files - -//!```rust -//! use avrow::{Reader, Schema, Writer, from_value}; -//! use std::str::FromStr; -//! use anyhow::Error; -//! -//! fn main() -> Result<(), Error> { -//! // Writing data -//! -//! // Create a schema -//! let schema = Schema::from_str(r##""null""##)?; -//! // Create writer using schema and provide a buffer to write to -//! let mut writer = Writer::new(&schema, vec![])?; -//! // Write data using write -//! writer.write(())?; -//! // or serialize via serde -//! writer.serialize(())?; -//! // retrieve the underlying buffer using the into_inner method. -//! let buf = writer.into_inner()?; -//! -//! // Reading data -//! -//! // Create Reader by providing a Read wrapped version of `buf` -//! let reader = Reader::new(buf.as_slice())?; -//! 
// Use iterator for reading data in an idiomatic manner. -//! for i in reader { -//! // reading values can fail due to decoding errors, so the return value of iterator is a Option> -//! // it allows one to examine the failure reason on the underlying avro reader. -//! dbg!(&i); -//! // This value can be converted to a native Rust type using from_value method from the serde impl. -//! let _: () = from_value(&i)?; -//! } -//! -//! Ok(()) -//! } - -//!``` - -#![doc( - html_favicon_url = "https://raw.githubusercontent.com/creativcoder/avrow/main/assets/avrow_logo.png" -)] -#![doc( - html_logo_url = "https://raw.githubusercontent.com/creativcoder/avrow/main/assets/avrow_logo.png" -)] -#![deny(missing_docs)] -#![recursion_limit = "1024"] -#![deny(unused_must_use)] -#![deny(rust_2018_idioms)] -#![deny(warnings)] - -mod codec; -pub mod config; -mod error; -mod reader; -mod schema; -mod serde_avro; -mod util; -mod value; -mod writer; - -pub use codec::Codec; -pub use error::AvrowErr; -pub use reader::from_value; -pub use reader::Header; -pub use reader::Reader; -pub use schema::Schema; -pub use serde_avro::to_value; -pub use value::Record; -pub use value::Value; -pub use writer::Writer; -pub use writer::WriterBuilder; diff --git a/src/reader.rs b/src/reader.rs deleted file mode 100644 index aeed588..0000000 --- a/src/reader.rs +++ /dev/null @@ -1,709 +0,0 @@ -use crate::codec::Codec; -use crate::config::DEFAULT_FLUSH_INTERVAL; -use crate::error; -use crate::schema; -use crate::serde_avro; -use crate::util::{decode_bytes, decode_string}; -use crate::value; -use byteorder::LittleEndian; -use byteorder::ReadBytesExt; -use error::AvrowErr; -use indexmap::IndexMap; -use integer_encoding::VarIntReader; -use schema::Registry; -use schema::Schema; -use schema::Variant; -use serde::Deserialize; -use serde_avro::SerdeReader; -use std::collections::HashMap; -use std::convert::TryFrom; -use std::io::Cursor; -use std::io::Read; -use std::io::{Error, ErrorKind}; -use std::str; -use 
std::str::FromStr; -use value::{FieldValue, Record, Value}; - -/// Reader is the primary interface for reading data from an avro datafile. -pub struct Reader<'a, R> { - source: R, - header: Header, - reader_schema: Option<&'a Schema>, - block_buffer: Cursor>, - entries_in_block: u64, -} - -impl<'a, R> Reader<'a, R> -where - R: Read, -{ - /// Creates a Reader from an avro encoded readable buffer. - pub fn new(mut source: R) -> Result { - let header = Header::from_reader(&mut source)?; - Ok(Reader { - source, - header, - reader_schema: None, - block_buffer: Cursor::new(vec![0u8; DEFAULT_FLUSH_INTERVAL]), - entries_in_block: 0, - }) - } - - /// Create a Reader with the given reader schema and a readable buffer. - pub fn with_schema(mut source: R, reader_schema: &'a Schema) -> Result { - let header = Header::from_reader(&mut source)?; - - Ok(Reader { - source, - header, - reader_schema: Some(reader_schema), - block_buffer: Cursor::new(vec![0u8; DEFAULT_FLUSH_INTERVAL]), - entries_in_block: 0, - }) - } - - // TODO optimize based on benchmarks - fn next_block(&mut self) -> Result<(), std::io::Error> { - // if no more bytes to read, read_varint below returns an EOF - let entries_in_block: i64 = self.source.read_varint()?; - self.entries_in_block = entries_in_block as u64; - - let block_stream_len: i64 = self.source.read_varint()?; - - let mut compressed_block = vec![0u8; block_stream_len as usize]; - self.source.read_exact(&mut compressed_block)?; - - self.header - .codec - .decode(compressed_block, &mut self.block_buffer) - .map_err(|e| { - Error::new( - ErrorKind::Other, - format!("Failed decoding block data with codec, {:?}", e), - ) - })?; - - // Ready for reading from block - self.block_buffer.set_position(0); - - let mut sync_marker_buf = [0u8; 16]; - let _ = self.source.read_exact(&mut sync_marker_buf); - - if sync_marker_buf != self.header.sync_marker { - let err = Error::new( - ErrorKind::Other, - "Sync marker does not match as expected while reading", - ); - 
return Err(err); - } - - Ok(()) - } - - /// Retrieves a reference to the header metadata map. - pub fn meta(&self) -> &HashMap> { - self.header.metadata() - } -} - -/// `from_value` is the serde API for deserialization of avro encoded data to native Rust types. -pub fn from_value<'de, D: Deserialize<'de>>( - value: &'de Result, -) -> Result { - match value { - Ok(v) => { - let mut serde_reader = SerdeReader::new(v); - D::deserialize(&mut serde_reader) - } - Err(e) => Err(AvrowErr::UnexpectedAvroValue { - value: e.to_string(), - }), - } -} - -impl<'a, 's, R: Read> Iterator for Reader<'_, R> { - type Item = Result; - - fn next(&mut self) -> Option { - // invariant: True on start and end of an avro datafile - if self.entries_in_block == 0 { - if let Err(e) = self.next_block() { - // marks the end of the avro datafile - if let std::io::ErrorKind::UnexpectedEof = e.kind() { - return None; - } else { - return Some(Err(AvrowErr::DecodeFailed(e))); - } - } - } - - let writer_schema = &self.header.schema; - let w_cxt = &writer_schema.cxt; - let reader_schema = &self.reader_schema; - let value = if let Some(r_schema) = reader_schema { - let r_cxt = &r_schema.cxt; - decode_with_resolution( - &r_schema.variant, - &writer_schema.variant, - &r_cxt, - &w_cxt, - &mut self.block_buffer, - ) - } else { - // decode without the reader schema - decode(&writer_schema.variant, &mut self.block_buffer, &w_cxt) - }; - - self.entries_in_block -= 1; - - if let Err(e) = value { - return Some(Err(e)); - } - - Some(value) - } -} - -// Reads places priority on reader's schema when passing any schema context if a reader schema is provided. 
-pub(crate) fn decode_with_resolution( - r_schema: &Variant, - w_schema: &Variant, - r_cxt: &Registry, - w_cxt: &Registry, - reader: &mut R, -) -> Result { - // LHS: Writer schema, RHS: Reader schema - let value = match (w_schema, r_schema) { - (Variant::Null, Variant::Null) => Value::Null, - (Variant::Boolean, Variant::Boolean) => { - let mut buf = [0u8; 1]; - reader - .read_exact(&mut buf) - .map_err(AvrowErr::DecodeFailed)?; - match buf { - [0x00] => Value::Boolean(false), - [0x01] => Value::Boolean(true), - _o => { - return Err(AvrowErr::DecodeFailed(Error::new( - ErrorKind::InvalidData, - "expecting a 0x00 or 0x01 as a byte for boolean value", - ))) - } - } - } - (Variant::Int, Variant::Int) => { - Value::Int(reader.read_varint().map_err(AvrowErr::DecodeFailed)?) - } - // int is promotable to long, float, or double (we read as int and cast to promotable.) - (Variant::Int, Variant::Long) => Value::Long( - reader - .read_varint::() - .map_err(AvrowErr::DecodeFailed)? as i64, - ), - (Variant::Int, Variant::Float) => Value::Float( - reader - .read_varint::() - .map_err(AvrowErr::DecodeFailed)? as f32, - ), - (Variant::Int, Variant::Double) => Value::Double( - reader - .read_varint::() - .map_err(AvrowErr::DecodeFailed)? as f64, - ), - (Variant::Long, Variant::Long) => { - Value::Long(reader.read_varint().map_err(AvrowErr::DecodeFailed)?) - } - // long is promotable to float or double - (Variant::Long, Variant::Float) => Value::Float( - reader - .read_varint::() - .map_err(AvrowErr::DecodeFailed)? as f32, - ), - (Variant::Long, Variant::Double) => Value::Double( - reader - .read_varint::() - .map_err(AvrowErr::DecodeFailed)? 
as f64, - ), - (Variant::Float, Variant::Float) => Value::Float( - reader - .read_f32::() - .map_err(AvrowErr::DecodeFailed)?, - ), - (Variant::Double, Variant::Double) => Value::Double( - reader - .read_f64::() - .map_err(AvrowErr::DecodeFailed)?, - ), - // float is promotable to double - (Variant::Float, Variant::Double) => Value::Double( - reader - .read_f32::() - .map_err(AvrowErr::DecodeFailed)? as f64, - ), - (Variant::Bytes, Variant::Bytes) => Value::Bytes(decode_bytes(reader)?), - // bytes is promotable to string - (Variant::Bytes, Variant::Str) => { - let bytes = decode_bytes(reader)?; - let s = str::from_utf8(&bytes).map_err(|_e| { - let err = Error::new(ErrorKind::InvalidData, "failed converting bytes to string"); - AvrowErr::DecodeFailed(err) - })?; - - Value::Str(s.to_string()) - } - (Variant::Str, Variant::Str) => { - let buf = decode_bytes(reader)?; - let s = str::from_utf8(&buf).map_err(|_e| { - let err = Error::new(ErrorKind::InvalidData, "failed converting bytes to string"); - AvrowErr::DecodeFailed(err) - })?; - Value::Str(s.to_string()) - } - // string is promotable to bytes - (Variant::Str, Variant::Bytes) => { - let buf = decode_bytes(reader)?; - Value::Bytes(buf) - } - (Variant::Array { items: w_items }, Variant::Array { items: r_items }) => { - if w_items == r_items { - let block_count: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let mut v = Vec::with_capacity(block_count as usize); - - for _ in 0..block_count { - let decoded = - decode_with_resolution(&*r_items, &*w_items, r_cxt, w_cxt, reader)?; - v.push(decoded); - } - - Value::Array(v) - } else { - return Err(AvrowErr::ArrayItemsMismatch); - } - } - // Resolution rules - // if both are records: - // * The ordering of fields may be different: fields are matched by name. [1] - // * Schemas for fields with the same name in both records are resolved recursively. 
[2] - // * If the writer's record contains a field with a name not present in the reader's record, - // the writer's value for that field is ignored. [3] - // * If the reader's record schema has a field that contains a default value, - // and writer's schema does not have a field with the same name, - // then the reader should use the default value from its field. [4] - // * If the reader's record schema has a field with no default value, - // and writer's schema does not have a field with the same name, an error is signalled. [5] - ( - Variant::Record { - name: writer_name, - fields: writer_fields, - .. - }, - Variant::Record { - name: reader_name, - fields: reader_fields, - .. - }, - ) => { - // [1] - let reader_name = reader_name.fullname(); - let writer_name = writer_name.fullname(); - if writer_name != reader_name { - return Err(AvrowErr::RecordNameMismatch); - } - - let mut rec = Record::new(&reader_name); - for f in reader_fields { - let reader_fieldname = f.0.as_str(); - let reader_field = f.1; - // [3] - if let Some(wf) = writer_fields.get(reader_fieldname) { - // [2] - let f_decoded = - decode_with_resolution(&reader_field.ty, &wf.ty, r_cxt, w_cxt, reader)?; - rec.insert(&reader_fieldname, f_decoded)?; - } else { - // [4] - let default_field = f.1; - if let Some(a) = &default_field.default { - rec.insert(&reader_fieldname, a.clone())?; - } else { - // [5] - return Err(AvrowErr::FieldNotFound); - } - } - } - - return Ok(Value::Record(rec)); - } - ( - Variant::Enum { - name: w_name, - symbols: w_symbols, - .. - }, - Variant::Enum { - name: r_name, - symbols: r_symbols, - .. 
- }, - ) => { - if w_name.fullname() != r_name.fullname() { - return Err(AvrowErr::EnumNameMismatch); - } - - let idx: i32 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let idx = idx as usize; - if idx >= w_symbols.len() { - return Err(AvrowErr::InvalidEnumSymbolIdx( - idx, - format!("{:?}", w_symbols), - )); - } - - let symbol = r_symbols.get(idx as usize); - if let Some(s) = symbol { - return Ok(Value::Enum(s.to_string())); - } else { - return Err(AvrowErr::EnumSymbolNotFound { idx }); - } - } - ( - Variant::Fixed { - name: w_name, - size: w_size, - }, - Variant::Fixed { - name: r_name, - size: r_size, - }, - ) => { - if w_name.fullname() != r_name.fullname() && w_size != r_size { - return Err(AvrowErr::FixedSchemaNameMismatch); - } else { - let mut fixed = vec![0u8; *r_size]; - reader - .read_exact(&mut fixed) - .map_err(AvrowErr::DecodeFailed)?; - Value::Fixed(fixed) - } - } - ( - Variant::Map { - values: writer_values, - }, - Variant::Map { - values: reader_values, - }, - ) => { - // here equality will be based - if writer_values == reader_values { - let block_count: i32 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let mut hm = HashMap::new(); - for _ in 0..block_count { - let key = decode_string(reader)?; - let value = decode(reader_values, reader, r_cxt)?; - hm.insert(key, value); - } - Value::Map(hm) - } else { - return Err(AvrowErr::MapSchemaMismatch); - } - } - ( - Variant::Union { - variants: writer_variants, - }, - Variant::Union { - variants: reader_variants, - }, - ) => { - let union_idx: i32 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - if let Some(writer_schema) = writer_variants.get(union_idx as usize) { - for i in reader_variants { - if i == writer_schema { - return decode(i, reader, r_cxt); - } - } - } - - return Err(AvrowErr::UnionSchemaMismatch); - } - /* - if reader's is a union but writer's is not. 
The first schema in the reader's union that matches - the writer's schema is recursively resolved against it. If none match, an error is signalled. - */ - ( - writer_schema, - Variant::Union { - variants: reader_variants, - }, - ) => { - for i in reader_variants { - if i == writer_schema { - return decode(i, reader, r_cxt); - } - } - - return Err(AvrowErr::WriterNotInReader); - } - /* - if writer's schema is a union, but reader's is not. - If the reader's schema matches the selected writer's schema, - it is recursively resolved against it. If they do not match, an error is signalled. - */ - ( - Variant::Union { - variants: writer_variants, - }, - reader_schema, - ) => { - // Read the index value in the schema - let union_idx: i32 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let schema = writer_variants.get(union_idx as usize); - if let Some(s) = schema { - if s == reader_schema { - return decode(reader_schema, reader, r_cxt); - } - } - let writer_schema = format!("writer schema: {:?}", writer_variants); - let reader_schema = format!("reader schema: {:?}", reader_schema); - return Err(AvrowErr::SchemaResolutionFailed( - reader_schema, - writer_schema, - )); - } - other => { - return Err(AvrowErr::SchemaResolutionFailed( - format!("{:?}", other.0), - format!("{:?}", other.1), - )) - } - }; - - Ok(value) -} - -pub(crate) fn decode( - schema: &Variant, - reader: &mut R, - w_cxt: &Registry, -) -> Result { - let value = match schema { - Variant::Null => Value::Null, - Variant::Boolean => { - let mut buf = [0u8; 1]; - reader - .read_exact(&mut buf) - .map_err(AvrowErr::DecodeFailed)?; - match buf { - [0x00] => Value::Boolean(false), - [0x01] => Value::Boolean(true), - _ => { - return Err(AvrowErr::DecodeFailed(Error::new( - ErrorKind::InvalidData, - "Invalid boolean value, expected a 0x00 or a 0x01", - ))) - } - } - } - Variant::Int => Value::Int(reader.read_varint().map_err(AvrowErr::DecodeFailed)?), - Variant::Double => Value::Double( - reader - 
.read_f64::() - .map_err(AvrowErr::DecodeFailed)?, - ), - Variant::Long => Value::Long(reader.read_varint().map_err(AvrowErr::DecodeFailed)?), - Variant::Float => Value::Float( - reader - .read_f32::() - .map_err(AvrowErr::DecodeFailed)?, - ), - Variant::Str => { - let buf = decode_bytes(reader)?; - let s = str::from_utf8(&buf).map_err(|_e| { - let err = Error::new( - ErrorKind::InvalidData, - "failed converting from bytes to string", - ); - AvrowErr::DecodeFailed(err) - })?; - Value::Str(s.to_string()) - } - Variant::Array { items } => { - let block_count: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - - if block_count == 0 { - // FIXME do we send an empty array? - return Ok(Value::Array(Vec::new())); - } - - let mut it = Vec::with_capacity(block_count as usize); - for _ in 0..block_count { - let decoded = decode(&**items, reader, w_cxt)?; - it.push(decoded); - } - - Value::Array(it) - } - Variant::Bytes => Value::Bytes(decode_bytes(reader)?), - Variant::Map { values } => { - let block_count: usize = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let mut hm = HashMap::new(); - for _ in 0..block_count { - let key = decode_string(reader)?; - let value = decode(values, reader, w_cxt)?; - hm.insert(key, value); - } - - Value::Map(hm) - } - Variant::Record { name, fields, .. } => { - let mut v = IndexMap::with_capacity(fields.len()); - for (field_name, field) in fields { - let field_name = field_name.to_string(); - let field_value = decode(&field.ty, reader, w_cxt)?; - let field_value = FieldValue::new(field_value); - v.insert(field_name, field_value); - } - - let rec = Record { - name: name.fullname(), - fields: v, - }; - Value::Record(rec) - } - Variant::Fixed { size, .. 
} => { - let mut buf = vec![0; *size]; - reader - .read_exact(&mut buf) - .map_err(AvrowErr::DecodeFailed)?; - Value::Fixed(buf) - } - Variant::Union { variants } => { - let variant_idx: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - decode(&variants[variant_idx as usize], reader, w_cxt)? - } - Variant::Named(schema_name) => { - let schema_variant = w_cxt - .get(schema_name) - .ok_or(AvrowErr::NamedSchemaNotFound)?; - decode(schema_variant, reader, w_cxt)? - } - a => { - return Err(AvrowErr::DecodeFailed(Error::new( - ErrorKind::InvalidData, - format!("Read failed for schema {:?}", a), - ))) - } - }; - - Ok(value) -} - -/// Header represents the avro datafile header. -#[derive(Debug)] -pub struct Header { - /// Writer's schema - pub(crate) schema: Schema, - /// A Map which stores avro metadata, like `avro.codec` and `avro.schema`. - /// Additional key values can be added through the - /// [WriterBuilder](struct.WriterBuilder.html)'s `set_metadata` method. - pub(crate) metadata: HashMap>, - /// A unique 16 byte sequence for file integrity when writing avro data to file. 
- pub(crate) sync_marker: [u8; 16], - /// codec parsed from the datafile - pub(crate) codec: Codec, -} - -fn decode_header_map(reader: &mut R) -> Result>, AvrowErr> -where - R: Read, -{ - let count: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let count = count as usize; - let mut map = HashMap::with_capacity(count); - - for _ in 0..count { - let key = decode_string(reader)?; - let val = decode_bytes(reader)?; - map.insert(key, val); - } - - let _map_end: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - - Ok(map) -} - -impl Header { - /// Reads the header from an avro datafile - pub fn from_reader(reader: &mut R) -> Result { - let mut magic_buf = [0u8; 4]; - reader - .read_exact(&mut magic_buf[..]) - .map_err(|_| AvrowErr::HeaderDecodeFailed)?; - - if &magic_buf != b"Obj\x01" { - return Err(AvrowErr::InvalidDataFile); - } - - let map = decode_header_map(reader)?; - - let mut sync_marker = [0u8; 16]; - let _ = reader - .read_exact(&mut sync_marker) - .map_err(|_| AvrowErr::HeaderDecodeFailed)?; - - let schema_bytes = map.get("avro.schema").ok_or(AvrowErr::HeaderDecodeFailed)?; - - let schema = str::from_utf8(schema_bytes) - .map(Schema::from_str) - .map_err(|_| AvrowErr::HeaderDecodeFailed)??; - - let codec = if let Some(c) = map.get("avro.codec") { - match std::str::from_utf8(c) { - Ok(s) => Codec::try_from(s)?, - Err(s) => return Err(AvrowErr::UnsupportedCodec(s.to_string())), - } - } else { - Codec::Null - }; - - let header = Header { - schema, - metadata: map, - sync_marker, - codec, - }; - - Ok(header) - } - - /// Returns a reference to metadata from avro datafile header - pub fn metadata(&self) -> &HashMap> { - &self.metadata - } - - /// Returns a reference to the writer's schema in this header - pub fn schema(&self) -> &Schema { - &self.schema - } -} - -#[cfg(test)] -mod tests { - use crate::Reader; - #[test] - fn has_required_headers() { - let data = vec![ - 79, 98, 106, 1, 4, 22, 97, 118, 114, 111, 46, 115, 99, 104, 101, 109, 
97, 32, 123, 34, - 116, 121, 112, 101, 34, 58, 34, 98, 121, 116, 101, 115, 34, 125, 20, 97, 118, 114, 111, - 46, 99, 111, 100, 101, 99, 14, 100, 101, 102, 108, 97, 116, 101, 0, 145, 85, 112, 15, - 87, 201, 208, 26, 183, 148, 48, 236, 212, 250, 38, 208, 2, 18, 227, 97, 96, 100, 98, - 102, 97, 5, 0, 145, 85, 112, 15, 87, 201, 208, 26, 183, 148, 48, 236, 212, 250, 38, - 208, - ]; - - let reader = Reader::new(data.as_slice()).unwrap(); - assert!(reader.meta().contains_key("avro.codec")); - assert!(reader.meta().contains_key("avro.schema")); - } -} diff --git a/src/schema/canonical.rs b/src/schema/canonical.rs deleted file mode 100644 index a6614be..0000000 --- a/src/schema/canonical.rs +++ /dev/null @@ -1,258 +0,0 @@ -use crate::schema::Name; -use crate::serde_avro::AvrowErr; -use serde_json::json; -use serde_json::Value as JsonValue; -use std::cmp::PartialEq; - -// wrap overflow of 0xc15d213aa4d7a795 -const EMPTY: i64 = -4513414715797952619; - -static FP_TABLE: once_cell::sync::Lazy<[i64; 256]> = { - use once_cell::sync::Lazy; - Lazy::new(|| { - let mut fp_table: [i64; 256] = [0; 256]; - for i in 0..256 { - let mut fp = i; - for _ in 0..8 { - fp = (fp as u64 >> 1) as i64 ^ (EMPTY & -(fp & 1)); - } - fp_table[i as usize] = fp; - } - fp_table - }) -}; - -// relevant fields and in order fields according to spec -const RELEVANT_FIELDS: [&str; 7] = [ - "name", "type", "fields", "symbols", "items", "values", "size", -]; -/// Represents canonical form of an avro schema. This representation removes irrelevant fields -/// such as docs and aliases in the schema. Fingerprinting methods are available on this instance. -#[derive(Debug, PartialEq)] -pub struct CanonicalSchema(pub(crate) JsonValue); - -impl std::fmt::Display for CanonicalSchema { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let c = serde_json::to_string_pretty(&self.0); - write!(f, "{}", c.map_err(|_| std::fmt::Error)?) 
- } -} - -impl CanonicalSchema { - #[cfg(feature = "sha2")] - pub fn sha256(&self) -> Vec { - use shatwo::{Digest, Sha256}; - let mut hasher = Sha256::new(); - hasher.update(self.0.to_string()); - let result = hasher.finalize(); - result.to_vec() - } - - #[cfg(feature = "md5")] - pub fn md5(&self) -> Vec { - let v = mdfive::compute(self.0.to_string().as_bytes()); - v.to_vec() - } - - pub fn rabin64(&self) -> i64 { - let buf = self.0.to_string(); - let buf = buf.as_bytes(); - let mut fp: i64 = EMPTY; - - buf.iter().for_each(|b| { - let idx = ((fp ^ *b as i64) & 0xff) as usize; - fp = (fp as u64 >> 8) as i64 ^ FP_TABLE[idx]; - }); - - fp - } -} - -// TODO unescape \uXXXX -// pub fn normalize_unescape(s: &str) -> &str { -// s -// } - -// [FULLNAMES] - traverse the `type` field and replace names with fullnames -pub fn normalize_name( - json_map: &mut serde_json::map::Map, - enclosing_namespace: Option<&str>, -) -> Result<(), AvrowErr> { - let name = Name::from_json_mut(json_map, enclosing_namespace)?; - - json_map["name"] = json!(name.fullname()); - - if let Some(JsonValue::Array(fields)) = json_map.get_mut("fields") { - for f in fields.iter_mut() { - if let JsonValue::Object(ref mut o) = f { - if let Some(JsonValue::Object(ref mut o)) = o.get_mut("type") { - if o.contains_key("name") { - normalize_name(o, name.namespace())?; - } - } - } - } - } - - Ok(()) -} - -// [STRIP] -pub fn normalize_strip( - schema: &mut serde_json::map::Map, -) -> Result<(), AvrowErr> { - if schema.contains_key("doc") { - schema.remove("doc").ok_or(AvrowErr::ParsingCanonicalForm)?; - } - if schema.contains_key("aliases") { - schema - .remove("aliases") - .ok_or(AvrowErr::ParsingCanonicalForm)?; - } - - Ok(()) -} - -type JsonMap = serde_json::map::Map; - -pub fn order_fields(json: &JsonMap) -> Result { - let mut ordered = JsonMap::new(); - - for field in RELEVANT_FIELDS.iter() { - if let Some(value) = json.get(*field) { - match value { - JsonValue::Object(m) => { - 
ordered.insert(field.to_string(), json!(order_fields(m)?)); - } - JsonValue::Array(a) => { - let mut obj_arr = vec![]; - for field in a { - match field { - JsonValue::Object(m) => { - obj_arr.push(json!(order_fields(m)?)); - } - _ => { - obj_arr.push(field.clone()); - } - } - } - - ordered.insert(field.to_string(), json!(obj_arr)); - } - _ => { - ordered.insert(field.to_string(), value.clone()); - } - } - } - } - - Ok(ordered) -} - -// The following steps in parsing canonical form are handled by serde so we rely on that. -// [INTEGERS] - serde will not parse a string with a zero prefixed integer. -// [WHITESPACE] - serde also eliminates whitespace. -// [STRINGS] - TODO in `normalize_unescape` -// For rest of the steps, we implement them as below -pub(crate) fn normalize_schema(json_schema: &JsonValue) -> Result { - match json_schema { - // Normalize a complex schema - JsonValue::Object(ref scm) => { - // [PRIMITIVES] - if let Some(JsonValue::String(s)) = scm.get("type") { - match s.as_ref() { - "record" | "enum" | "array" | "maps" | "union" | "fixed" => {} - _ => { - return Ok(json!(s)); - } - } - } - - let mut schema = scm.clone(); - // [FULLNAMES] - if schema.contains_key("name") { - normalize_name(&mut schema, None)?; - } - // [ORDER] - let mut schema = order_fields(&schema)?; - // [STRIP] - normalize_strip(&mut schema)?; - Ok(json!(schema)) - } - // [PRIMITIVES] - // Normalize a primitive schema - a @ JsonValue::String(_) => Ok(json!(a)), - // Normalize a union schema - JsonValue::Array(v) => { - let mut variants = Vec::with_capacity(v.len()); - for i in v { - let normalized = normalize_schema(i)?; - variants.push(normalized); - } - Ok(json!(v)) - } - _other => Err(AvrowErr::UnknownSchema), - } -} - -#[cfg(test)] -mod tests { - use crate::Schema; - use std::str::FromStr; - #[test] - fn canonical_primitives() { - let schema_str = r##"{"type": "null"}"##; - let _ = Schema::from_str(schema_str).unwrap(); - } - - #[test] - #[cfg(feature = "fingerprint")] - fn 
canonical_schema_sha256_fingerprint() { - let header_schema = r##"{"type": "record", "name": "org.apache.avro.file.Header", - "fields" : [ - {"name": "magic", "type": {"type": "fixed", "name": "Magic", "size": 4}}, - {"name": "meta", "type": {"type": "map", "values": "bytes"}}, - {"name": "sync", "type": {"type": "fixed", "name": "Sync", "size": 16}} - ] - }"##; - let schema = Schema::from_str(header_schema).unwrap(); - let canonical = schema.canonical_form(); - - let expected = "809bed56cf47c84e221ad8b13e28a66ed9cd6b1498a43bad9aa0c868205e"; - let found = canonical.sha256(); - let mut fingerprint_str = String::new(); - for i in found { - let a = format!("{:x}", i); - fingerprint_str.push_str(&a); - } - - assert_eq!(expected, fingerprint_str); - } - - #[test] - #[cfg(feature = "fingerprint")] - fn schema_rabin_fingerprint() { - let schema = r##""null""##; - let expected = "0x63dd24e7cc258f8a"; - let schema = Schema::from_str(schema).unwrap(); - let canonical = schema.canonical_form(); - let actual = format!("0x{:x}", canonical.rabin64()); - assert_eq!(expected, actual); - } - - #[test] - #[cfg(feature = "fingerprint")] - fn schema_md5_fingerprint() { - let schema = r##""null""##; - let expected = "9b41ef67651c18488a8b8bb67c75699"; - let schema = Schema::from_str(schema).unwrap(); - let canonical = schema.canonical_form(); - let actual = canonical.md5(); - let mut fingerprint_str = String::new(); - for i in actual { - let a = format!("{:x}", i); - fingerprint_str.push_str(&a); - } - assert_eq!(expected, fingerprint_str); - } -} diff --git a/src/schema/common.rs b/src/schema/common.rs deleted file mode 100644 index 63c8a11..0000000 --- a/src/schema/common.rs +++ /dev/null @@ -1,331 +0,0 @@ -// This module contains definition of types that are common across a subset of -// avro Schema implementation. 
- -use crate::error::AvrowErr; -use crate::schema::Variant; -use crate::value::Value; -use serde_json::Value as JsonValue; -use std::fmt::{self, Display}; -use std::str::FromStr; - -/////////////////////////////////////////////////////////////////////////////// -/// Name implementation for named types: record, fixed, enum -/////////////////////////////////////////////////////////////////////////////// - -pub(crate) fn validate_name(idx: usize, name: &str) -> Result<(), AvrowErr> { - if name.contains('.') - || (name.starts_with(|a: char| a.is_ascii_digit()) && idx == 0) - || name.is_empty() - || !name.chars().any(|a| a.is_ascii_alphanumeric() || a == '_') - { - Err(AvrowErr::InvalidName) - } else { - Ok(()) - } -} - -// Follows the grammer: | [()*] -pub(crate) fn validate_namespace(s: &str) -> Result<(), AvrowErr> { - let split = s.split('.'); - for (i, n) in split.enumerate() { - let _ = validate_name(i, n).map_err(|_| AvrowErr::InvalidNamespace)?; - } - Ok(()) -} - -/// Represents the `fullname` attribute -/// of a named avro type i.e, Record, Fixed and Enum. -#[derive(Debug, Clone, Eq, PartialOrd, Ord)] -pub struct Name { - pub(crate) name: String, - pub(crate) namespace: Option, -} - -impl Name { - // Creates a new name with validation. This will extract the namespace if a dot is present in `name` - // Any further calls to set_namespace, will be a noop if the name already contains a dot. 
- pub(crate) fn new(name: &str) -> Result { - let mut namespace = None; - let name = if name.contains('.') { - // should not have multiple dots and dots in end or start - let _ = validate_namespace(name)?; - // strip namespace - let idx = name.rfind('.').unwrap(); // we check for ., so it's okay - namespace = Some(name[..idx].to_string()); - let name = &name[idx + 1..]; - validate_name(0, name)?; - name - } else { - validate_name(0, name)?; - name - }; - - Ok(Self { - name: name.to_string(), - namespace, - }) - } - - pub(crate) fn from_json( - json: &serde_json::map::Map, - enclosing_namespace: Option<&str>, - ) -> Result { - let mut name = if let Some(JsonValue::String(ref s)) = json.get("name") { - Name::new(s) - } else { - return Err(AvrowErr::NameParseFailed); - }?; - - // As per spec, If the name field has a dot, that is a fullname. any namespace provided is ignored. - // If no namespace was extracted from the name itself (i.e., name did not contain a dot) - // we then see if we have the namespace field on the json itself - // otherwise we use the enclosing namespace if that is a Some(namespace) - if name.namespace.is_none() { - if let Some(namespace) = json.get("namespace") { - if let JsonValue::String(s) = namespace { - validate_namespace(s)?; - name.set_namespace(s)?; - } - } else if let Some(a) = enclosing_namespace { - validate_namespace(a)?; - name.set_namespace(a)?; - } - } - - Ok(name) - } - - pub(crate) fn namespace(&self) -> Option<&str> { - self.namespace.as_deref() - } - - // receives a mutable json and parses a Name and removes namespace. Used for canonicalization. 
- pub(crate) fn from_json_mut( - json: &mut serde_json::map::Map, - enclosing_namespace: Option<&str>, - ) -> Result { - let mut name = if let Some(JsonValue::String(ref s)) = json.get("name") { - Name::new(s) - } else { - return Err(AvrowErr::NameParseFailed); - }?; - - if name.namespace.is_none() { - if let Some(namespace) = json.get("namespace") { - if let JsonValue::String(s) = namespace { - validate_namespace(s)?; - name.set_namespace(s)?; - json.remove("namespace"); - } - } else if let Some(a) = enclosing_namespace { - validate_namespace(a)?; - name.set_namespace(a)?; - } - } - - Ok(name) - } - - pub(crate) fn set_namespace(&mut self, namespace: &str) -> Result<(), AvrowErr> { - // empty string is a null namespace - if namespace.is_empty() { - return Ok(()); - } - - validate_namespace(namespace)?; - // If a namespace was already extracted when constructing name (name had a dot) - // then this is a noop - if self.namespace.is_none() { - let _ = validate_namespace(namespace)?; - self.namespace = Some(namespace.to_string()); - } - Ok(()) - } - - // TODO according to Rust convention, item path separators are :: instead of . - // should we add a configurable separator? 
- pub(crate) fn fullname(&self) -> String { - if let Some(n) = &self.namespace { - if n.is_empty() { - // According to spec, it's fine to put "" as a namespace, which becomes a null namespace - self.name.to_string() - } else { - format!("{}.{}", n, self.name) - } - } else { - self.name.to_string() - } - } -} - -impl Display for Name { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if let Some(ref namespace) = self.namespace { - write!(f, "{}.{}", namespace, self.name) - } else { - write!(f, "{}", self.name) - } - } -} - -impl FromStr for Name { - type Err = AvrowErr; - - fn from_str(s: &str) -> Result { - Name::new(s) - } -} - -impl std::convert::TryFrom<&str> for Name { - type Error = AvrowErr; - - fn try_from(value: &str) -> Result { - Name::new(value) - } -} - -impl PartialEq for Name { - fn eq(&self, other: &Self) -> bool { - self.fullname() == other.fullname() - } -} - -/////////////////////////////////////////////////////////////////////////////// -/// Ordering for record fields -/////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, PartialEq, Clone)] -pub enum Order { - Ascending, - Descending, - Ignore, -} - -impl FromStr for Order { - type Err = AvrowErr; - fn from_str(s: &str) -> Result { - match s { - "ascending" => Ok(Order::Ascending), - "descending" => Ok(Order::Descending), - "ignore" => Ok(Order::Ignore), - _ => Err(AvrowErr::UnknownFieldOrdering), - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -/// Record field definition. 
-/////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Clone)] -pub struct Field { - pub(crate) name: String, - pub(crate) ty: Variant, - pub(crate) default: Option, - pub(crate) order: Order, - pub(crate) aliases: Option>, -} - -impl std::cmp::PartialEq for Field { - fn eq(&self, other: &Self) -> bool { - self.name == other.name && self.ty == other.ty && self.order == other.order - } -} - -impl Field { - pub(crate) fn new( - name: &str, - ty: Variant, - default: Option, - order: Order, - aliases: Option>, - ) -> Result { - // According to spec, field names also must adhere to a valid nane. - validate_name(0, name)?; - Ok(Field { - name: name.to_string(), - ty, - default, - order, - aliases, - }) - } -} - -#[cfg(test)] -mod tests { - use super::validate_namespace; - use super::Name; - - #[test] - #[should_panic(expected = "InvalidName")] - fn name_starts_with_number() { - Name::new("2org.apache.avro").unwrap(); - } - - #[test] - #[should_panic(expected = "InvalidNamespace")] - fn invalid_namespace() { - let mut name = Name::new("org.apache.avro").unwrap(); - name.set_namespace("23").unwrap(); - } - - #[test] - fn name_with_seperate_namespace() { - let mut name = Name::new("hello").unwrap(); - let _ = name.set_namespace("org.foo"); - assert_eq!("org.foo.hello", name.fullname().to_string()); - } - - #[test] - fn name_contains_dots() { - let name = Name::new("org.apache.avro").unwrap(); - assert_eq!("avro", name.name.to_string()); - assert_eq!("org.apache.avro", name.fullname().to_string()); - } - - #[test] - fn fullname_with_empty_namespace() { - let mut name = Name::new("org.apache.avro").unwrap(); - name.set_namespace("").unwrap(); - assert_eq!("org.apache.avro", name.fullname()); - } - - #[test] - fn multiple_dots_invalid() { - let a = "some.namespace..foo"; - assert!(validate_namespace(a).is_err()); - } - - #[test] - fn name_has_dot_and_namespace_present() { - let json_str = r##" - { - "name":"my.longlist", - 
"namespace":"com.some", - "type":"record" - } - "##; - let json: serde_json::Value = serde_json::from_str(json_str).unwrap(); - let name = Name::from_json(json.as_object().unwrap(), None).unwrap(); - assert_eq!(name.name, "longlist"); - assert_eq!(name.namespace, Some("my".to_string())); - assert_eq!(name.fullname(), "my.longlist"); - } - - #[test] - fn name_no_dot_and_namespace_present() { - let json_str = r##" - { - "name":"longlist", - "namespace":"com.some", - "type":"record" - } - "##; - let json: serde_json::Value = serde_json::from_str(json_str).unwrap(); - let name = Name::from_json(json.as_object().unwrap(), None).unwrap(); - assert_eq!(name.name, "longlist"); - assert_eq!(name.namespace, Some("com.some".to_string())); - assert_eq!(name.fullname(), "com.some.longlist"); - } -} diff --git a/src/schema/mod.rs b/src/schema/mod.rs deleted file mode 100644 index ba6420e..0000000 --- a/src/schema/mod.rs +++ /dev/null @@ -1,266 +0,0 @@ -//! Contains routines for parsing and validating an Avro schema. -//! Schemas in avro are written as JSON and can be provided as .avsc files -//! to a Writer or a Reader. 
- -pub mod common; -#[cfg(test)] -mod tests; -use crate::error::AvrowErr; -pub use common::Order; -mod canonical; -pub mod parser; -pub(crate) use parser::Registry; - -use crate::error::AvrowResult; -use crate::value::Value; -use canonical::normalize_schema; -use canonical::CanonicalSchema; -use common::{Field, Name}; -use indexmap::IndexMap; -use serde_json::{self, Value as JsonValue}; -use std::fmt::Debug; -use std::fs::OpenOptions; -use std::path::Path; - -#[derive(Debug, Clone, PartialEq)] -pub(crate) enum Variant { - Null, - Boolean, - Int, - Long, - Float, - Double, - Bytes, - Str, - Record { - name: Name, - aliases: Option>, - fields: IndexMap, - }, - Fixed { - name: Name, - size: usize, - }, - Enum { - name: Name, - aliases: Option>, - symbols: Vec, - }, - Map { - values: Box, - }, - Array { - items: Box, - }, - Union { - variants: Vec, - }, - Named(String), -} - -/// Represents the avro schema used to write encoded avro data. -#[derive(Debug)] -pub struct Schema { - // TODO can remove this if not needed - inner: JsonValue, - // Schema context that has a lookup table to resolve named schema references - pub(crate) cxt: Registry, - // typed and stripped version of schema used internally. - pub(crate) variant: Variant, - // canonical form of schema. This is used for equality. - pub(crate) canonical: CanonicalSchema, -} - -impl PartialEq for Schema { - fn eq(&self, other: &Self) -> bool { - self.canonical == other.canonical - } -} - -impl std::fmt::Display for Schema { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - std::fmt::Display::fmt(&self.inner, f) - } -} - -impl std::str::FromStr for Schema { - type Err = AvrowErr; - /// Parse an avro schema from a JSON string - /// One can use Rust's raw string syntax (r##""##) to pass schema. 
- fn from_str(schema: &str) -> Result { - let schema_json = - serde_json::from_str(schema).map_err(|e| AvrowErr::SchemaParseErr(e.into()))?; - Schema::parse_imp(schema_json) - } -} - -impl Schema { - /// Parses an avro schema from a JSON schema in a file. - /// Alternatively, one can use the [`FromStr`](https://doc.rust-lang.org/std/str/trait.FromStr.html) - /// impl to create the Schema from a JSON string: - /// ``` - /// use std::str::FromStr; - /// use avrow::Schema; - /// - /// let schema = Schema::from_str(r##""null""##).unwrap(); - /// ``` - pub fn from_path + Debug>(path: P) -> AvrowResult { - let schema_file = OpenOptions::new() - .read(true) - .open(&path) - .map_err(AvrowErr::SchemaParseErr)?; - let value = - serde_json::from_reader(schema_file).map_err(|e| AvrowErr::SchemaParseErr(e.into()))?; - Schema::parse_imp(value) - } - - fn parse_imp(schema_json: JsonValue) -> AvrowResult { - let mut parser = Registry::new(); - let pcf = CanonicalSchema(normalize_schema(&schema_json)?); - // TODO see if we can use canonical form to parse variant - let variant = parser.parse_schema(&schema_json, None)?; - Ok(Schema { - inner: schema_json, - cxt: parser, - variant, - canonical: pcf, - }) - } - - pub(crate) fn as_bytes(&self) -> Vec { - format!("{}", self.inner).into_bytes() - } - - pub(crate) fn variant(&self) -> &Variant { - &self.variant - } - - #[inline(always)] - pub(crate) fn validate(&self, value: &Value) -> AvrowResult<()> { - self.variant.validate(value, &self.cxt) - } - - /// Returns the canonical form of an Avro schema. 
- /// Example: - /// ```rust - /// use avrow::Schema; - /// use std::str::FromStr; - /// - /// let schema = Schema::from_str(r##" - /// { - /// "type": "record", - /// "name": "LongList", - /// "aliases": ["LinkedLongs"], - /// "fields" : [ - /// {"name": "value", "type": "long"}, - /// {"name": "next", "type": ["null", "LongList"] - /// }] - /// } - /// "##).unwrap(); - /// - /// let canonical = schema.canonical_form(); - /// ``` - pub fn canonical_form(&self) -> &CanonicalSchema { - &self.canonical - } -} - -impl Variant { - pub fn validate(&self, value: &Value, cxt: &Registry) -> AvrowResult<()> { - let variant = self; - match (value, variant) { - (Value::Null, Variant::Null) - | (Value::Boolean(_), Variant::Boolean) - | (Value::Int(_), Variant::Int) - // long is promotable to float or double - | (Value::Long(_), Variant::Long) - | (Value::Long(_), Variant::Float) - | (Value::Long(_), Variant::Double) - // int is promotable to long, float or double - | (Value::Int(_), Variant::Long) - | (Value::Int(_), Variant::Float) - | (Value::Int(_), Variant::Double) - | (Value::Float(_), Variant::Float) - // float is promotable to double - | (Value::Float(_), Variant::Double) - | (Value::Double(_), Variant::Double) - | (Value::Str(_), Variant::Str) - // string is promotable to bytes - | (Value::Str(_), Variant::Bytes) - // bytes is promotable to string - | (Value::Bytes(_), Variant::Str) - | (Value::Bytes(_), Variant::Bytes) => {}, - (Value::Fixed(v), Variant::Fixed { size, .. }) - | (Value::Bytes(v), Variant::Fixed { size, .. }) => { - if v.len() != *size { - return Err(AvrowErr::FixedValueLenMismatch { - found: v.len(), - expected: *size, - }); - } - } - (Value::Record(rec), Variant::Record { ref fields, .. 
}) => { - for (fname, fvalue) in &rec.fields { - if let Some(ftype) = fields.get(fname) { - ftype.ty.validate(&fvalue.value, cxt)?; - } else { - return Err(AvrowErr::RecordFieldMissing); - } - } - } - (Value::Map(hmap), Variant::Map { values }) => { - return if let Some(v) = hmap.values().next() { - values.validate(v, cxt) - } else { - Err(AvrowErr::EmptyMap) - } - } - (Value::Enum(sym), Variant::Enum { symbols, .. }) if symbols.contains(sym) => { - return Ok(()) - } - (Value::Array(item), Variant::Array { items }) => { - return if let Some(v) = item.first() { - items.validate(v, cxt) - } else { - Err(AvrowErr::EmptyArray) - } - } - (v, Variant::Named(name)) => { - if let Some(schema) = cxt.get(&name) { - if schema.validate(v, cxt).is_ok() { - return Ok(()); - } - } - return Err(AvrowErr::NamedSchemaNotFoundForValue) - } - // Value `a` can be any of the above schemas + any named schema in the schema registry - (a, Variant::Union { variants }) => { - for s in variants.iter() { - if s.validate(a, cxt).is_ok() { - return Ok(()); - } - } - - return Err(AvrowErr::NotFoundInUnion) - } - - (v, s) => { - return Err(AvrowErr::SchemaDataValidationFailed( - format!("{:?}", v), - format!("{:?}", s), - )) - } - } - - Ok(()) - } - - fn get_named_mut(&mut self) -> Option<&mut Name> { - match self { - Variant::Record { name, .. } - | Variant::Fixed { name, .. } - | Variant::Enum { name, .. 
} => Some(name), - _ => None, - } - } -} diff --git a/src/schema/parser.rs b/src/schema/parser.rs deleted file mode 100644 index 824c2be..0000000 --- a/src/schema/parser.rs +++ /dev/null @@ -1,498 +0,0 @@ -use super::common::{Field, Name, Order}; -use super::Variant; -use crate::error::io_err; -use crate::error::AvrowErr; -use crate::error::AvrowResult; -use crate::schema::common::validate_name; -use crate::value::FieldValue; -use crate::value::Value; -use indexmap::IndexMap; -use serde_json::{Map, Value as JsonValue}; -use std::borrow::ToOwned; -use std::collections::HashMap; - -// Wraps a { name -> schema } lookup table to aid parsing named references in complex schemas -// During parsing, the value for each key may get updated as a schema discovers -// more information about the schema during parsing. -#[derive(Debug, Clone)] -pub(crate) struct Registry { - // TODO: use a reference to Variant? - cxt: HashMap, -} - -impl Registry { - pub(crate) fn new() -> Self { - Self { - cxt: HashMap::new(), - } - } - - pub(crate) fn get<'a>(&'a self, name: &str) -> Option<&'a Variant> { - self.cxt.get(name) - } - - pub(crate) fn parse_schema( - &mut self, - value: &JsonValue, - enclosing_namespace: Option<&str>, - ) -> Result { - match value { - // Parse a complex schema - JsonValue::Object(ref schema) => self.parse_object(schema, enclosing_namespace), - // Parse a primitive schema, could also be a named schema reference - JsonValue::String(ref schema) => self.parse_primitive(&schema, enclosing_namespace), - // Parse a union schema - JsonValue::Array(ref schema) => self.parse_union(schema, enclosing_namespace), - _ => Err(AvrowErr::UnknownSchema), - } - } - - fn parse_union( - &mut self, - schema: &[JsonValue], - enclosing_namespace: Option<&str>, - ) -> Result { - let mut union_schema = vec![]; - for s in schema { - let parsed_schema = self.parse_schema(s, enclosing_namespace)?; - match parsed_schema { - Variant::Union { .. 
} => { - return Err(AvrowErr::DuplicateSchemaInUnion); - } - _ => { - if union_schema.contains(&parsed_schema) { - return Err(AvrowErr::DuplicateSchemaInUnion); - } else { - union_schema.push(parsed_schema); - } - } - } - } - Ok(Variant::Union { - variants: union_schema, - }) - } - - fn get_fullname(&self, name: &str, enclosing_namespace: Option<&str>) -> String { - if let Some(namespace) = enclosing_namespace { - format!("{}.{}", namespace, name) - } else { - name.to_string() - } - } - - /// Parse a `serde_json::Value` representing a primitive Avro type into a `Schema`. - fn parse_primitive( - &mut self, - schema: &str, - enclosing_namespace: Option<&str>, - ) -> Result { - match schema { - "null" => Ok(Variant::Null), - "boolean" => Ok(Variant::Boolean), - "int" => Ok(Variant::Int), - "long" => Ok(Variant::Long), - "double" => Ok(Variant::Double), - "float" => Ok(Variant::Float), - "bytes" => Ok(Variant::Bytes), - "string" => Ok(Variant::Str), - other if !other.is_empty() => { - let name = self.get_fullname(other, enclosing_namespace); - if self.cxt.contains_key(&name) { - Ok(Variant::Named(name)) - } else { - Err(AvrowErr::SchemaParseErr(io_err(&format!( - "named schema `{}` must be defined before use", - other - )))) - } - } - _ => Err(AvrowErr::InvalidPrimitiveSchema), - } - } - - fn parse_record_fields( - &mut self, - fields: &[serde_json::Value], - enclosing_namespace: Option<&str>, - ) -> Result, AvrowErr> { - let mut fields_parsed = IndexMap::with_capacity(fields.len()); - for field_obj in fields { - match field_obj { - JsonValue::Object(o) => { - let name = o - .get("name") - .and_then(|a| a.as_str()) - .ok_or(AvrowErr::RecordNameNotFound)?; - - let ty: &JsonValue = o.get("type").ok_or(AvrowErr::RecordTypeNotFound)?; - let mut ty = self.parse_schema(ty, enclosing_namespace)?; - - // if ty is named use enclosing namespace to construct the fullname - if let Some(name) = ty.get_named_mut() { - // if parsed type has its own namespace - if 
name.namespace().is_none() { - if let Some(namespace) = enclosing_namespace { - name.set_namespace(namespace)?; - } - } - } - - let default = if let Some(v) = o.get("default") { - Some(parse_default(v, &ty)?) - } else { - None - }; - - let order = if let Some(order) = o.get("order") { - parse_field_order(order)? - } else { - Order::Ascending - }; - - let aliases = parse_aliases(o.get("aliases")); - - if fields_parsed.contains_key(name) { - return Err(AvrowErr::DuplicateField); - } - - fields_parsed.insert( - name.to_string(), - Field::new(name, ty, default, order, aliases)?, - ); - } - _ => return Err(AvrowErr::InvalidRecordFieldType), - } - } - - Ok(fields_parsed) - } - - fn parse_object( - &mut self, - value: &Map, - enclosing_namespace: Option<&str>, - ) -> Result { - match value.get("type") { - Some(&JsonValue::String(ref s)) if s == "record" => { - let rec_name = Name::from_json(value, enclosing_namespace)?; - - // Insert a named reference to support recursive schema definitions. - self.cxt - .insert(rec_name.to_string(), Variant::Named(rec_name.to_string())); - - let fields = if let Some(JsonValue::Array(ref fields_vec)) = value.get("fields") { - fields_vec - } else { - return Err(AvrowErr::ExpectedFieldsJsonArray); - }; - - let fields = self.parse_record_fields(fields, { - if rec_name.namespace().is_some() { - // Most tightly enclosing namespace, which is this namespace - rec_name.namespace() - } else { - enclosing_namespace - } - })?; - - let aliases = parse_aliases(value.get("aliases")); - - let rec = Variant::Record { - name: rec_name.clone(), - aliases, - fields, - }; - - let rec_for_registry = rec.clone(); - let rec_name = rec_name.to_string(); - - // if a record schema is being redefined throw an error. 
- if let Some(Variant::Named(_)) = self.cxt.get(&rec_name) { - self.cxt.insert(rec_name, rec_for_registry); - } else { - return Err(AvrowErr::DuplicateSchema); - } - - Ok(rec) - } - Some(&JsonValue::String(ref s)) if s == "enum" => { - let name = Name::from_json(value, enclosing_namespace)?; - let aliases = parse_aliases(value.get("aliases")); - let mut symbols = vec![]; - - if let Some(v) = value.get("symbols") { - match v { - JsonValue::Array(sym) => { - // let mut symbols = Vec::with_capacity(sym.len()); - for v in sym { - let symbol = v.as_str().ok_or(AvrowErr::EnumSymbolParseErr)?; - validate_name(0, symbol)?; - symbols.push(symbol.to_string()); - } - } - other => { - return Err(AvrowErr::EnumParseErr(format!("{:?}", other))); - } - } - } else { - return Err(AvrowErr::EnumSymbolsMissing); - } - - let name_str = name.fullname(); - - let enum_schema = Variant::Enum { - name, - aliases, - symbols, - }; - - self.cxt.insert(name_str, enum_schema.clone()); - - Ok(enum_schema) - } - Some(&JsonValue::String(ref s)) if s == "array" => { - let item_missing_err = AvrowErr::SchemaParseErr(io_err( - "Array schema must have `items` field defined", - )); - let items_schema = value.get("items").ok_or(item_missing_err)?; - let parsed_items = self.parse_schema(items_schema, enclosing_namespace)?; - Ok(Variant::Array { - items: Box::new(parsed_items), - }) - } - Some(&JsonValue::String(ref s)) if s == "map" => { - let item_missing_err = - AvrowErr::SchemaParseErr(io_err("Map schema must have `values` field defined")); - let items_schema = value.get("values").ok_or(item_missing_err)?; - let parsed_items = self.parse_schema(items_schema, enclosing_namespace)?; - Ok(Variant::Map { - values: Box::new(parsed_items), - }) - } - Some(&JsonValue::String(ref s)) if s == "fixed" => { - let name = Name::from_json(value, enclosing_namespace)?; - let size = value.get("size").ok_or(AvrowErr::FixedSizeNotFound)?; - let name_str = name.fullname(); - - let fixed_schema = Variant::Fixed { - name, 
- size: size.as_u64().ok_or(AvrowErr::FixedSizeNotNumber)? as usize, // clamp to usize - }; - - self.cxt.insert(name_str, fixed_schema.clone()); - - Ok(fixed_schema) - } - Some(JsonValue::String(ref s)) if s == "null" => Ok(Variant::Null), - Some(JsonValue::String(ref s)) if s == "boolean" => Ok(Variant::Boolean), - Some(JsonValue::String(ref s)) if s == "int" => Ok(Variant::Int), - Some(JsonValue::String(ref s)) if s == "long" => Ok(Variant::Long), - Some(JsonValue::String(ref s)) if s == "float" => Ok(Variant::Float), - Some(JsonValue::String(ref s)) if s == "double" => Ok(Variant::Double), - Some(JsonValue::String(ref s)) if s == "bytes" => Ok(Variant::Bytes), - Some(JsonValue::String(ref s)) if s == "string" => Ok(Variant::Str), - _other => Err(AvrowErr::SchemaParseFailed), - } - } -} - -// TODO add support if needed -// fn parse_doc(value: Option<&JsonValue>) -> Option { -// if let Some(JsonValue::String(s)) = value { -// Some(s.to_string()) -// } else { -// None -// } -// } - -// Parses the `order` of a field, defaults to `ascending` order -pub(crate) fn parse_field_order(order: &JsonValue) -> AvrowResult { - match order { - JsonValue::String(s) => match s.as_ref() { - "ascending" => Ok(Order::Ascending), - "descending" => Ok(Order::Descending), - "ignore" => Ok(Order::Ignore), - _ => Err(AvrowErr::UnknownFieldOrdering), - }, - _ => Err(AvrowErr::InvalidFieldOrdering), - } -} - -// Parses aliases of a field -fn parse_aliases(aliases: Option<&JsonValue>) -> Option> { - match aliases { - Some(JsonValue::Array(ref aliases)) => { - let mut alias_parsed = Vec::with_capacity(aliases.len()); - for a in aliases { - let a = a.as_str().map(ToOwned::to_owned)?; - alias_parsed.push(a); - } - Some(alias_parsed) - } - _ => None, - } -} - -pub(crate) fn parse_default( - default_value: &JsonValue, - schema_variant: &Variant, -) -> Result { - match (default_value, schema_variant) { - (d, Variant::Union { variants }) => { - let first_variant = 
variants.first().ok_or(AvrowErr::FailedDefaultUnion)?; - parse_default(d, first_variant) - } - (JsonValue::Null, Variant::Null) => Ok(Value::Null), - (JsonValue::Bool(v), Variant::Boolean) => Ok(Value::Boolean(*v)), - (JsonValue::Number(n), Variant::Int) => Ok(Value::Int(n.as_i64().unwrap() as i32)), - (JsonValue::Number(n), Variant::Long) => Ok(Value::Long(n.as_i64().unwrap())), - (JsonValue::Number(n), Variant::Float) => Ok(Value::Float(n.as_f64().unwrap() as f32)), - (JsonValue::Number(n), Variant::Double) => Ok(Value::Double(n.as_f64().unwrap() as f64)), - (JsonValue::String(n), Variant::Bytes) => Ok(Value::Bytes(n.as_bytes().to_vec())), - (JsonValue::String(n), Variant::Str) => Ok(Value::Str(n.clone())), - (JsonValue::Object(v), Variant::Record { name, fields, .. }) => { - let mut values = IndexMap::with_capacity(v.len()); - - for (k, v) in v { - let parsed_value = - parse_default(v, &fields.get(k).ok_or(AvrowErr::DefaultValueParse)?.ty)?; - values.insert(k.to_string(), FieldValue::new(parsed_value)); - } - - Ok(Value::Record(crate::value::Record { - fields: values, - name: name.to_string(), - })) - } - (JsonValue::String(n), Variant::Enum { symbols, .. }) => { - if symbols.contains(n) { - Ok(Value::Str(n.clone())) - } else { - Err(AvrowErr::EnumSymbolNotPresent) - } - } - (JsonValue::Array(arr), Variant::Array { items }) => { - let mut default_arr_items: Vec = Vec::with_capacity(arr.len()); - for v in arr { - let parsed_default = parse_default(v, items); - default_arr_items.push(parsed_default?); - } - - Ok(Value::Array(default_arr_items)) - } - ( - JsonValue::Object(map), - Variant::Map { - values: values_schema, - }, - ) => { - let mut values = std::collections::HashMap::with_capacity(map.len()); - for (k, v) in map { - let parsed_value = parse_default(v, values_schema)?; - values.insert(k.to_string(), parsed_value); - } - - Ok(Value::Map(values)) - } - - (JsonValue::String(n), Variant::Fixed { .. 
}) => Ok(Value::Fixed(n.as_bytes().to_vec())), - (_d, _s) => Err(AvrowErr::DefaultValueParse), - } -} - -#[cfg(test)] -mod tests { - use crate::schema::common::Order; - use crate::schema::Field; - use crate::schema::Name; - use crate::schema::Variant; - use crate::Schema; - use crate::Value; - use indexmap::IndexMap; - use std::str::FromStr; - #[test] - fn schema_parse_default_values() { - let schema = Schema::from_str( - r##"{ - "type": "record", - "name": "Can", - "doc":"Represents a can data", - "namespace": "com.avrow", - "aliases": ["my_linked_list"], - "fields" : [ - { - "name": "next", - "type": ["null", "Can"] - }, - { - "name": "value", - "type": "long", - "default": 1, - "aliases": ["data"], - "order": "descending", - "doc": "This field holds the value of the linked list" - } - ] - }"##, - ) - .unwrap(); - - let mut fields = IndexMap::new(); - let f1 = Field::new( - "value", - Variant::Long, - Some(Value::Long(1)), - Order::Descending, - None, - ) - .unwrap(); - let f2 = Field::new( - "next", - Variant::Union { - variants: vec![Variant::Null, Variant::Named("com.avrow.Can".to_string())], - }, - None, - Order::Ascending, - None, - ) - .unwrap(); - fields.insert("value".to_string(), f1); - fields.insert("next".to_string(), f2); - - let mut name = Name::new("Can").unwrap(); - name.set_namespace("com.avrow").unwrap(); - - let s = Variant::Record { - name, - aliases: Some(vec!["my_linked_list".to_string()]), - fields, - }; - - assert_eq!(&s, schema.variant()); - } - - #[test] - fn nested_record_fields_parses_properly_with_fullnames() { - let schema = Schema::from_str(r##"{ - "name": "longlist", - "namespace": "com.some", - "type":"record", - "fields": [ - {"name": "magic", "type": {"type": "fixed", "name": "magic", "size": 4, "namespace": "com.bar"} - }, - {"name": "inner_rec", "type": {"type": "record", "name": "inner_rec", "fields": [ - { - "name": "test", - "type": {"type": "fixed", "name":"hello", "size":5} - } - ]}} - ] - }"##).unwrap(); - - 
assert!(schema.cxt.cxt.contains_key("com.bar.magic")); - assert!(schema.cxt.cxt.contains_key("com.some.hello")); - assert!(schema.cxt.cxt.contains_key("com.some.longlist")); - assert!(schema.cxt.cxt.contains_key("com.some.inner_rec")); - } -} diff --git a/src/schema/tests.rs b/src/schema/tests.rs deleted file mode 100644 index f6de8bc..0000000 --- a/src/schema/tests.rs +++ /dev/null @@ -1,455 +0,0 @@ -use super::common::{Field, Name, Order}; -use super::{Schema, Variant}; -use indexmap::IndexMap; -use std::collections::HashMap; -use std::str::FromStr; - -fn primitive_schema_objects() -> HashMap<&'static str, Variant> { - let mut s = HashMap::new(); - s.insert(r##"{ "type": "null" }"##, Variant::Null); - s.insert(r##"{ "type": "boolean" }"##, Variant::Boolean); - s.insert(r##"{ "type": "int" }"##, Variant::Int); - s.insert(r##"{ "type": "long" }"##, Variant::Long); - s.insert(r##"{ "type": "float" }"##, Variant::Float); - s.insert(r##"{ "type": "double" }"##, Variant::Double); - s.insert(r##"{ "type": "bytes" }"##, Variant::Bytes); - s.insert(r##"{ "type": "string" }"##, Variant::Str); - s -} - -fn primitive_schema_canonical() -> HashMap<&'static str, Variant> { - let mut s = HashMap::new(); - s.insert(r##""null""##, Variant::Null); - s.insert(r##""boolean""##, Variant::Boolean); - s.insert(r##""int""##, Variant::Int); - s.insert(r##""long""##, Variant::Long); - s.insert(r##""float""##, Variant::Float); - s.insert(r##""double""##, Variant::Double); - s.insert(r##""bytes""##, Variant::Bytes); - s.insert(r##""string""##, Variant::Str); - s -} - -#[test] -fn parse_primitives_as_json_objects() { - for (s, v) in primitive_schema_objects() { - let schema = Schema::from_str(s).unwrap(); - assert_eq!(schema.variant, v); - } -} - -#[test] -fn parse_primitives_as_defined_types() { - for (s, v) in primitive_schema_canonical() { - let schema = Schema::from_str(s).unwrap(); - assert_eq!(schema.variant, v); - } -} - -#[test] -fn parse_record() { - let record_schema = 
Schema::from_str( - r##"{ - "type": "record", - "name": "LongOrNull", - "namespace":"com.test", - "aliases": ["MaybeLong"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "other", "type": ["null", "LongOrNull"]} - ] - }"##, - ) - .unwrap(); - - let union_variants = vec![ - Variant::Null, - Variant::Named("com.test.LongOrNull".to_string()), - ]; - - let mut fields_map = IndexMap::new(); - fields_map.insert( - "value".to_string(), - Field::new("value", Variant::Long, None, Order::Ascending, None).unwrap(), - ); - fields_map.insert( - "other".to_string(), - Field::new( - "other", - Variant::Union { - variants: union_variants, - }, - None, - Order::Ascending, - None, - ) - .unwrap(), - ); - - let mut name = Name::new("LongOrNull").unwrap(); - name.set_namespace("com.test").unwrap(); - - assert_eq!( - record_schema.variant, - Variant::Record { - name, - aliases: Some(vec!["MaybeLong".to_string()]), - fields: fields_map, - } - ); -} - -#[test] -fn parse_fixed() { - let fixed_schema = - Schema::from_str(r##"{"type": "fixed", "size": 16, "name": "md5"}"##).unwrap(); - assert_eq!( - fixed_schema.variant, - Variant::Fixed { - name: Name::new("md5").unwrap(), - size: 16 - } - ); -} - -#[test] -fn parse_enum() { - let json = r##"{ - "type": "enum", - "name": "Suit", - "symbols" : ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"] - }"##; - let enum_schema = Schema::from_str(json).unwrap(); - let name = Name::new("Suit").unwrap(); - let mut symbols = vec![]; - symbols.push("SPADES".to_owned()); - symbols.push("HEARTS".to_owned()); - symbols.push("DIAMONDS".to_owned()); - symbols.push("CLUBS".to_owned()); - - assert_eq!( - enum_schema.variant, - Variant::Enum { - name, - aliases: None, - symbols - } - ); -} - -#[test] -fn parse_array() { - let json = r##"{"type": "array", "items": "string"}"##; - let array_schema = Schema::from_str(json).unwrap(); - assert_eq!( - array_schema.variant, - Variant::Array { - items: Box::new(Variant::Str) - } - ); -} - -#[test] -fn 
parse_map() { - let map_schema = Schema::from_str(r##"{"type": "map", "values": "long"}"##).unwrap(); - assert_eq!( - map_schema.variant, - Variant::Map { - values: Box::new(Variant::Long) - } - ); -} - -/////////////////////////////////////////////////////////////////////////////// -/// Union -/////////////////////////////////////////////////////////////////////////////// - -#[test] -fn parse_simple_union() { - let union_schema = Schema::from_str(r##"["null", "string"]"##).unwrap(); - assert_eq!( - union_schema.variant, - Variant::Union { - variants: vec![Variant::Null, Variant::Str] - } - ); -} - -#[test] -#[should_panic] -fn parse_union_duplicate_primitive_fails() { - let mut results = vec![]; - for i in primitive_schema_canonical() { - let json = &format!("[{}, {}]", i.0, i.0); - results.push(Schema::from_str(json).is_err()); - } - - assert!(results.iter().any(|a| !(*a))); -} - -#[test] -fn parse_union_with_different_named_type_but_same_schema_succeeds() { - let union_schema = Schema::from_str( - r##"[ - { - "type":"record", - "name": "record_one", - "fields" : [ - {"name": "value", "type": "long"} - ] - }, - { - "type":"record", - "name": "record_two", - "fields" : [ - {"name": "value", "type": "long"} - ] - }]"##, - ); - - assert!(union_schema.is_ok()); -} - -#[test] -fn parse_union_with_same_named_type_fails() { - let union_schema = Schema::from_str( - r##"[ - { - "type":"record", - "name": "record_one", - "fields" : [ - {"name": "value", "type": "long"} - ] - }, - { - "type":"record", - "name": "record_one", - "fields" : [ - {"name": "value", "type": "long"} - ] - }]"##, - ); - - assert!(union_schema.is_err()); -} - -#[test] -fn parse_union_field_invalid_default_values() { - let default_valued_schema = Schema::from_str( - r##" - { - "name": "Company", - "type": "record", - "fields": [ - { - "name": "emp_name", - "type": "string", - "doc": "employee name" - }, - { - "name": "bonus", - "type": ["null", "long"], - "default": null, - "doc": "bonus received on a 
yearly basis" - }, - { - "name": "subordinates", - "type": ["null", {"type": "map", "values": "string"}], - "default": {"foo":"bar"}, - "doc": "map of subordinates Name and Designation" - }, - { - "name": "departments", - "type":["null", {"type":"array", "items":"string" }], - "default": ["Sam", "Bob"], - "doc": "Departments under the employee" - } - ] - } - "##, - ); - - assert!(default_valued_schema.is_err()); -} - -#[test] -fn parse_default_values_record() { - let default_valued_schema = Schema::from_str( - r##" - { - "name": "Company", - "type": "record", - "namespace": "com.test.avrow", - "fields": [ - { - "name": "bonus", - "type": ["null", "long"], - "default": null, - "doc": "bonus received on a yearly basis" - } - ] - } - "##, - ); - - assert!(default_valued_schema.is_ok()); -} - -#[test] -#[should_panic(expected = "DuplicateSchema")] -fn fails_on_duplicate_schema() { - let schema = r##"{ - "type": "record", - "namespace": "test.avro.training", - "name": "SomeMessage", - "fields": [{ - "name": "is_error", - "type": "boolean", - "default": false - }, { - "name": "outcome", - "type": [{ - "type": "record", - "name": "SomeMessage", - "fields": [] - }, { - "type": "record", - "name": "ErrorRecord", - "fields": [{ - "name": "errors", - "type": { - "type": "map", - "values": "string" - }, - "doc": "doc" - }] - }] - }] - }"##; - - Schema::from_str(schema).unwrap(); -} - -#[test] -#[should_panic] -fn parse_immediate_unions_fails() { - let default_valued_schema = Schema::from_str( - r##" - ["null", "string", ["null", "int"]]"##, - ); - - assert!(default_valued_schema.is_ok()); -} - -#[test] -fn parse_simple_default_values_record() { - let _default_valued_schema = Schema::from_str( - r##" - { - "name": "com.school.Student", - "type": "record", - "fields": [ - { - "name": "departments", - "type":[{"type":"array", "items":"string" }, "null"], - "default": ["Computer Science", "Finearts"], - "doc": "Departments of a student" - } - ] - } - "##, - ) - .unwrap(); -} - 
-#[test] -fn parse_default_record_value_in_union() { - let schema = Schema::from_str( - r##" - { - "name": "com.big.data.avro.schema.Employee", - "type": "record", - "fields": [ - { - "name": "departments", - "type":[ - {"type":"record", - "name": "dept_name", - "fields":[{"name":"id","type": "string"}, {"name":"foo", "type": "null"}] }], - "default": {"id": "foo", "foo": null} - } - ] - } - "##, - ) - .unwrap(); - - if let Variant::Record { fields, .. } = schema.variant { - match &fields["departments"].default { - Some(crate::Value::Record(r)) => { - assert!(r.fields.contains_key("id")); - assert_eq!( - r.fields["id"], - crate::value::FieldValue::new(crate::Value::Str("foo".to_string())) - ); - } - _ => panic!("should be a record"), - } - } -} - -#[test] -#[should_panic(expected = "must be defined before use")] -fn named_schema_must_be_defined_before_being_used() { - let _schema = Schema::from_str( - r##"{ - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "OtherList"]} - ] - }"##, - ) - .unwrap(); -} - -#[test] -fn test_two_instance_schema_equality() { - let raw_schema = r#" - { - "type": "record", - "name": "User", - "doc": "Hi there.", - "fields": [ - {"name": "likes_pizza", "type": "boolean", "default": false}, - {"name": "aa-i32", - "type": {"type": "array", "items": {"type": "array", "items": "int"}}, - "default": [[0], [12, -1]]} - ] - } - "#; - - let schema = Schema::from_str(raw_schema).unwrap(); - let schema2 = Schema::from_str(raw_schema).unwrap(); - assert_eq!(schema, schema2); -} - -#[test] -#[should_panic(expected = "DuplicateField")] -fn duplicate_field_name_in_record_fails() { - let raw_schema = r#" - { - "type": "record", - "name": "Person", - "doc": "Hi there.", - "fields": [ - {"name": "id", "type": "string", "default": "dsf8e8"}, - {"name": "id", "type": "int", "default": 56} - ] - } - "#; - - Schema::from_str(raw_schema).unwrap(); -} 
diff --git a/src/serde_avro/de.rs b/src/serde_avro/de.rs deleted file mode 100644 index fec2a41..0000000 --- a/src/serde_avro/de.rs +++ /dev/null @@ -1,170 +0,0 @@ -use super::de_impl::{ArrayDeserializer, ByteSeqDeserializer, MapDeserializer, StructReader}; -use crate::error::AvrowErr; - -use crate::value::Value; - -use serde::de::IntoDeserializer; -use serde::de::{self, Visitor}; -use serde::forward_to_deserialize_any; - -pub(crate) struct SerdeReader<'de> { - pub(crate) inner: &'de Value, -} - -impl<'de> SerdeReader<'de> { - pub(crate) fn new(inner: &'de Value) -> Self { - SerdeReader { inner } - } -} - -impl<'de, 'a> de::Deserializer<'de> for &'a mut SerdeReader<'de> { - type Error = AvrowErr; - - fn deserialize_any(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - match self.inner { - Value::Null => visitor.visit_unit(), - Value::Boolean(v) => visitor.visit_bool(*v), - Value::Int(v) => visitor.visit_i32(*v), - Value::Long(v) => visitor.visit_i64(*v), - Value::Float(v) => visitor.visit_f32(*v), - Value::Double(v) => visitor.visit_f64(*v), - Value::Str(ref v) => visitor.visit_borrowed_str(v), - Value::Bytes(ref bytes) => visitor.visit_borrowed_bytes(&bytes), - Value::Array(items) => visitor.visit_seq(ArrayDeserializer::new(&items)), - Value::Enum(s) => visitor.visit_enum(s.as_str().into_deserializer()), - _ => Err(AvrowErr::Unsupported), - } - } - - forward_to_deserialize_any! 
{ - unit bool u8 i8 i16 i32 i64 u16 u32 u64 f32 f64 str bytes byte_buf string ignored_any enum - } - - fn deserialize_option(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - visitor.visit_some(self) - } - - fn deserialize_unit_struct( - self, - _name: &'static str, - visitor: V, - ) -> Result - where - V: Visitor<'de>, - { - visitor.visit_unit() - } - - fn deserialize_seq(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - match self.inner { - Value::Array(ref items) => visitor.visit_seq(ArrayDeserializer::new(items)), - // TODO figure out the correct byte stram to use - Value::Bytes(buf) | Value::Fixed(buf) => { - let byte_seq_deser = ByteSeqDeserializer { input: buf.iter() }; - visitor.visit_seq(byte_seq_deser) - } - Value::Union(v) => match v.as_ref() { - Value::Array(ref items) => visitor.visit_seq(ArrayDeserializer::new(items)), - _ => Err(AvrowErr::Unsupported), - }, - _ => Err(AvrowErr::Unsupported), - } - } - - // avro bytes - fn deserialize_tuple(self, _len: usize, visitor: V) -> Result - where - V: serde::de::Visitor<'de>, - { - self.deserialize_seq(visitor) - } - - // for struct field - fn deserialize_identifier(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - self.deserialize_str(visitor) - } - - fn deserialize_map(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - match self.inner { - Value::Map(m) => { - let map_de = MapDeserializer { - keys: m.keys(), - values: m.values(), - }; - visitor.visit_map(map_de) - } - v => Err(AvrowErr::UnexpectedAvroValue { - value: format!("{:?}", v), - }), - } - } - - fn deserialize_struct( - self, - _a: &'static str, - _b: &'static [&'static str], - visitor: V, - ) -> Result - where - V: Visitor<'de>, - { - match self.inner { - Value::Record(ref r) => visitor.visit_map(StructReader::new(r.fields.iter())), - Value::Union(ref inner) => match **inner { - Value::Record(ref rec) => visitor.visit_map(StructReader::new(rec.fields.iter())), - _ => Err(de::Error::custom("Union variant 
not a record/struct")), - }, - _ => Err(de::Error::custom("Must be a record/struct")), - } - } - - /////////////////////////////////////////////////////////////////////////// - /// Not yet supported types - /////////////////////////////////////////////////////////////////////////// - - fn deserialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - _visitor: V, - ) -> Result - where - V: Visitor<'de>, - { - // TODO it is not clear to what avro schema can a tuple map to - Err(AvrowErr::Unsupported) - } - - fn deserialize_newtype_struct( - self, - _name: &'static str, - _visitor: V, - ) -> Result - where - V: Visitor<'de>, - { - Err(AvrowErr::Unsupported) - } - - fn deserialize_char(self, _visitor: V) -> Result - where - V: Visitor<'de>, - { - Err(AvrowErr::Unsupported) - } -} diff --git a/src/serde_avro/de_impl.rs b/src/serde_avro/de_impl.rs deleted file mode 100644 index eb47bba..0000000 --- a/src/serde_avro/de_impl.rs +++ /dev/null @@ -1,193 +0,0 @@ -use super::de::SerdeReader; -use crate::error::AvrowErr; -use crate::value::FieldValue; -use crate::Value; -use indexmap::map::Iter as MapIter; -use serde::de; -use serde::de::DeserializeSeed; -use serde::de::Visitor; -use serde::forward_to_deserialize_any; -use std::collections::hash_map::Keys; -use std::collections::hash_map::Values; -use std::slice::Iter; - -pub(crate) struct StructReader<'de> { - input: MapIter<'de, String, FieldValue>, - value: Option<&'de FieldValue>, -} - -impl<'de> StructReader<'de> { - pub fn new(input: MapIter<'de, String, FieldValue>) -> Self { - StructReader { input, value: None } - } -} - -impl<'de> de::MapAccess<'de> for StructReader<'de> { - type Error = AvrowErr; - - fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> - where - K: DeserializeSeed<'de>, - { - match self.input.next() { - Some(item) => { - let (ref field, ref value) = item; - self.value = Some(value); - seed.deserialize(StrDeserializer { input: &field }) - .map(Some) - } - None => Ok(None), - } - } - 
- fn next_value_seed(&mut self, seed: V) -> Result - where - V: DeserializeSeed<'de>, - { - let a = self.value.take(); - if let Some(a) = a { - match &a.value { - Value::Null => seed.deserialize(NullDeserializer), - value => seed.deserialize(&mut SerdeReader { inner: &value }), - } - } else { - Err(de::Error::custom("Unexpected call to next_value_seed.")) - } - } -} - -pub(crate) struct ArrayDeserializer<'de> { - input: Iter<'de, Value>, -} - -impl<'de> ArrayDeserializer<'de> { - pub fn new(input: &'de [Value]) -> Self { - Self { - input: input.iter(), - } - } -} - -impl<'de> de::SeqAccess<'de> for ArrayDeserializer<'de> { - type Error = AvrowErr; - - fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> - where - T: DeserializeSeed<'de>, - { - match self.input.next() { - Some(item) => seed.deserialize(&mut SerdeReader::new(item)).map(Some), - None => Ok(None), - } - } -} - -pub(crate) struct ByteSeqDeserializer<'de> { - pub(crate) input: Iter<'de, u8>, -} - -impl<'de> de::SeqAccess<'de> for ByteSeqDeserializer<'de> { - type Error = AvrowErr; - - fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> - where - T: DeserializeSeed<'de>, - { - match self.input.next() { - Some(item) => seed.deserialize(ByteDeserializer { byte: item }).map(Some), - None => Ok(None), - } - } -} - -pub(crate) struct ByteDeserializer<'de> { - pub(crate) byte: &'de u8, -} - -impl<'de> de::Deserializer<'de> for ByteDeserializer<'de> { - type Error = AvrowErr; - - fn deserialize_any(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - visitor.visit_u8(*self.byte) - } - - forward_to_deserialize_any! 
{ - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit option - seq bytes byte_buf map unit_struct newtype_struct - tuple_struct struct tuple enum identifier ignored_any - } -} - -pub(crate) struct MapDeserializer<'de> { - pub(crate) keys: Keys<'de, String, Value>, - pub(crate) values: Values<'de, String, Value>, -} - -impl<'de> de::MapAccess<'de> for MapDeserializer<'de> { - type Error = AvrowErr; - - fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> - where - K: DeserializeSeed<'de>, - { - match self.keys.next() { - Some(key) => seed.deserialize(StrDeserializer { input: key }).map(Some), - None => Ok(None), - } - } - - fn next_value_seed(&mut self, seed: V) -> Result - where - V: DeserializeSeed<'de>, - { - match self.values.next() { - Some(value) => seed.deserialize(&mut SerdeReader::new(value)), - None => Err(Self::Error::Message( - "Unexpected call to next_value_seed".to_string(), - )), - } - } -} - -pub(crate) struct StrDeserializer<'de> { - input: &'de str, -} - -impl<'de> de::Deserializer<'de> for StrDeserializer<'de> { - type Error = AvrowErr; - - fn deserialize_any(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - visitor.visit_borrowed_str(&self.input) - } - - forward_to_deserialize_any! { - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit option - seq bytes byte_buf map unit_struct newtype_struct - tuple_struct struct tuple enum identifier ignored_any - } -} - -pub(crate) struct NullDeserializer; - -impl<'de> de::Deserializer<'de> for NullDeserializer { - type Error = AvrowErr; - - fn deserialize_any(self, visitor: V) -> Result - where - V: Visitor<'de>, - { - visitor.visit_none() - } - - forward_to_deserialize_any! 
{ - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit option - seq bytes byte_buf map unit_struct newtype_struct - tuple_struct struct tuple enum identifier ignored_any - } -} diff --git a/src/serde_avro/mod.rs b/src/serde_avro/mod.rs deleted file mode 100644 index af2f22b..0000000 --- a/src/serde_avro/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -mod de; -mod de_impl; -mod ser; -mod ser_impl; - -pub(crate) use self::de::SerdeReader; -pub use self::ser::{to_value, SerdeWriter}; -pub use crate::error::AvrowErr; diff --git a/src/serde_avro/ser.rs b/src/serde_avro/ser.rs deleted file mode 100644 index 359dc9e..0000000 --- a/src/serde_avro/ser.rs +++ /dev/null @@ -1,261 +0,0 @@ -use super::ser_impl::{MapSerializer, SeqSerializer, StructSerializer}; -use crate::error::AvrowErr; -use crate::value::Value; -use serde::ser::{self, Serialize}; - -pub struct SerdeWriter; - -/// `to_value` is the serde API for serialization of Rust types to an [avrow::Value](enum.Value.html) -pub fn to_value(value: &T) -> Result -where - T: Serialize, -{ - let mut serializer = SerdeWriter; - value.serialize(&mut serializer) -} - -impl<'b> ser::Serializer for &'b mut SerdeWriter { - type Ok = Value; - type Error = AvrowErr; - type SerializeSeq = SeqSerializer; - type SerializeMap = MapSerializer; - type SerializeStruct = StructSerializer; - type SerializeTuple = SeqSerializer; - type SerializeTupleStruct = Unsupported; - type SerializeTupleVariant = Unsupported; - type SerializeStructVariant = Unsupported; - - fn serialize_bool(self, v: bool) -> Result { - Ok(Value::Boolean(v)) - } - - fn serialize_i8(self, v: i8) -> Result { - Ok(Value::Byte(v as u8)) - } - - fn serialize_i16(self, v: i16) -> Result { - Ok(Value::Int(v as i32)) - } - - fn serialize_i32(self, v: i32) -> Result { - Ok(Value::Int(v as i32)) - } - - fn serialize_i64(self, v: i64) -> Result { - Ok(Value::Long(v)) - } - - fn serialize_u8(self, v: u8) -> Result { - // using the auxiliary avro value - Ok(Value::Byte(v)) - } - - 
fn serialize_u16(self, v: u16) -> Result { - Ok(Value::Int(v as i32)) - } - - fn serialize_u32(self, v: u32) -> Result { - Ok(Value::Int(v as i32)) - } - - fn serialize_u64(self, v: u64) -> Result { - Ok(Value::Long(v as i64)) - } - - fn serialize_f32(self, v: f32) -> Result { - Ok(Value::Float(v)) - } - - fn serialize_f64(self, v: f64) -> Result { - Ok(Value::Double(v)) - } - - fn serialize_char(self, v: char) -> Result { - Ok(Value::Str(v.to_string())) - } - - fn serialize_str(self, v: &str) -> Result { - Ok(Value::Str(v.to_owned())) - } - - fn serialize_bytes(self, v: &[u8]) -> Result { - // todo: identify call path to this - Ok(Value::Bytes(v.to_owned())) - } - - fn serialize_none(self) -> Result { - Ok(Value::Null) - } - - fn serialize_some(self, value: &T) -> Result - where - T: Serialize, - { - Ok(value.serialize(&mut SerdeWriter)?) - } - - fn serialize_unit(self) -> Result { - Ok(Value::Null) - } - - fn serialize_unit_struct(self, _: &'static str) -> Result { - self.serialize_unit() - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _index: u32, - variant: &'static str, - ) -> Result { - Ok(Value::Enum(variant.to_string())) - } - - fn serialize_newtype_struct( - self, - _: &'static str, - value: &T, - ) -> Result - where - T: Serialize, - { - value.serialize(self) - } - - fn serialize_seq(self, len: Option) -> Result { - Ok(SeqSerializer::new(len)) - } - - fn serialize_map(self, len: Option) -> Result { - Ok(MapSerializer::new(len)) - } - - fn serialize_struct( - self, - name: &'static str, - len: usize, - ) -> Result { - Ok(StructSerializer::new(name, len)) - } - - fn serialize_tuple(self, _len: usize) -> Result { - self.serialize_seq(Some(_len)) - } - - fn serialize_tuple_struct( - self, - _: &'static str, - _len: usize, - ) -> Result { - unimplemented!("Avro does not support Rust tuple structs"); - } - - fn serialize_tuple_variant( - self, - _: &'static str, - _: u32, - _: &'static str, - _: usize, - ) -> Result { - // TODO Is there a 
way we can map union type to some valid avro type - Err(AvrowErr::Message( - "Tuple type is not currently supported as per avro spec".to_string(), - )) - } - - fn serialize_struct_variant( - self, - _: &'static str, - _: u32, - _: &'static str, - _: usize, - ) -> Result { - unimplemented!("Avro enums does not support struct variants in enum") - } - - fn serialize_newtype_variant( - self, - _: &'static str, - _: u32, - _: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - unimplemented!("Avro does not support newtype struct variants in enums"); - } -} - -/////////////////////////////////////////////////////////////////////////////// -/// Unsupported types in avro -/////////////////////////////////////////////////////////////////////////////// - -pub struct Unsupported; - -// struct enum variant -impl ser::SerializeStructVariant for Unsupported { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_field(&mut self, _: &'static str, _: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - unimplemented!("Avro enums does not support data in its variant") - } - - fn end(self) -> Result { - unimplemented!("Avro enums does not support data in its variant") - } -} - -// tuple enum variant -impl ser::SerializeTupleVariant for Unsupported { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_field(&mut self, _: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - unimplemented!("Avro enums does not support Rust tuple variants in enums") - } - - fn end(self) -> Result { - unimplemented!("Avro enums does not support Rust tuple variant in enums") - } -} - -// TODO maybe we can map it by looking at the schema -impl ser::SerializeTupleStruct for Unsupported { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_field(&mut self, _value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - unimplemented!("Avro enums does not support Rust tuple struct") - } - - fn end(self) -> Result { - unimplemented!("Avro enums 
does not support Rust tuple struct") - } -} - -impl<'a> ser::SerializeTuple for Unsupported { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_element(&mut self, _value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - unimplemented!("Avro enums does not support Rust tuples") - } - - fn end(self) -> Result { - unimplemented!("Avro enums does not support Rust tuples") - } -} diff --git a/src/serde_avro/ser_impl.rs b/src/serde_avro/ser_impl.rs deleted file mode 100644 index c8e9c78..0000000 --- a/src/serde_avro/ser_impl.rs +++ /dev/null @@ -1,195 +0,0 @@ -use super::SerdeWriter; -use crate::error::AvrowErr; -use crate::value::FieldValue; -use crate::value::Record; -use crate::Value; -use serde::Serialize; -use std::collections::HashMap; - -pub struct MapSerializer { - map: HashMap, -} - -impl MapSerializer { - pub fn new(len: Option) -> Self { - let map = match len { - Some(len) => HashMap::with_capacity(len), - None => HashMap::new(), - }; - - MapSerializer { map } - } -} - -impl serde::ser::SerializeMap for MapSerializer { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: Serialize, - V: Serialize, - { - let key = key.serialize(&mut SerdeWriter)?; - if let Value::Str(s) = key { - let value = value.serialize(&mut SerdeWriter)?; - self.map.insert(s, value); - Ok(()) - } else { - Err(AvrowErr::ExpectedString) - } - } - - fn serialize_key(&mut self, _key: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - Ok(()) - } - - fn serialize_value(&mut self, _value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - Ok(()) - } - - fn end(self) -> Result { - Ok(Value::Map(self.map)) - } -} - -////////////////////////////////////////////////////////////////////////////// -/// Rust structs to avro record -////////////////////////////////////////////////////////////////////////////// -pub struct StructSerializer { - name: String, - fields: 
indexmap::IndexMap, -} - -impl StructSerializer { - pub fn new(name: &str, len: usize) -> StructSerializer { - StructSerializer { - name: name.to_string(), - fields: indexmap::IndexMap::with_capacity(len), - } - } -} - -impl serde::ser::SerializeStruct for StructSerializer { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_field( - &mut self, - name: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: Serialize, - { - self.fields.insert( - name.to_owned(), - FieldValue::new(value.serialize(&mut SerdeWriter)?), - ); - Ok(()) - } - - fn end(self) -> Result { - let record = Record { - name: self.name, - fields: self.fields, - }; - Ok(Value::Record(record)) - } -} - -////////////////////////////////////////////////////////////////////////////// -/// Sequences -////////////////////////////////////////////////////////////////////////////// - -pub struct SeqSerializer { - items: Vec, -} - -impl SeqSerializer { - pub fn new(len: Option) -> SeqSerializer { - let items = match len { - Some(len) => Vec::with_capacity(len), - None => Vec::new(), - }; - - SeqSerializer { items } - } -} - -// Helper function to extract a Vec from a Vec -// This should only be called by the caller who knows that the items -// in the Vec a Value::Byte(u8). -// NOTE: Does collect on an into_iter() allocate a new vec? -fn as_byte_vec(a: Vec) -> Vec { - a.into_iter() - .map(|v| { - if let Value::Byte(b) = v { - b - } else { - unreachable!("Expecting a byte value in the Vec") - } - }) - .collect() -} - -impl<'a> serde::ser::SerializeSeq for SeqSerializer { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let v = value.serialize(&mut SerdeWriter)?; - self.items.push(v); - Ok(()) - } - - // If the items in vec are of Value::Byte(u8) then return a byte array. - // FIXME: maybe implement Serialize directly for Vec to avoid this way. 
- fn end(self) -> Result { - match self.items.first() { - Some(Value::Byte(_)) => Ok(Value::Bytes(as_byte_vec(self.items))), - _ => Ok(Value::Array(self.items)), - } - } -} - -////////////////////////////////////////////////////////////////////////////// -/// Tuples: avro bytes, fixed -////////////////////////////////////////////////////////////////////////////// - -impl<'a> serde::ser::SerializeTuple for SeqSerializer { - type Ok = Value; - type Error = AvrowErr; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let v = value.serialize(&mut SerdeWriter)?; - self.items.push(v); - Ok(()) - } - - // If the items in vec are of Value::Byte(u8) then return a byte array. - // FIXME: maybe implement Serialize directly for Vec to avoid this way. - fn end(self) -> Result { - match self.items.first() { - Some(Value::Byte(_)) => Ok(Value::Bytes(as_byte_vec(self.items))), - Some(Value::Fixed(_)) => Ok(Value::Fixed(as_byte_vec(self.items))), - _ => Ok(Value::Array(self.items)), - } - } -} diff --git a/src/util.rs b/src/util.rs deleted file mode 100644 index 4306105..0000000 --- a/src/util.rs +++ /dev/null @@ -1,34 +0,0 @@ -use crate::error::AvrowErr; -use integer_encoding::VarIntReader; -use integer_encoding::VarIntWriter; -use std::io::{Error, ErrorKind, Read, Write}; -use std::str; - -pub(crate) fn decode_string(reader: &mut R) -> Result { - let buf = decode_bytes(reader)?; - let s = str::from_utf8(&buf).map_err(|_e| { - let err = Error::new(ErrorKind::InvalidData, "Failed decoding string from bytes"); - AvrowErr::DecodeFailed(err) - })?; - Ok(s.to_string()) -} - -pub(crate) fn decode_bytes(reader: &mut R) -> Result, AvrowErr> { - let len: i64 = reader.read_varint().map_err(AvrowErr::DecodeFailed)?; - let mut byte_buf = vec![0u8; len as usize]; - reader - .read_exact(&mut byte_buf) - .map_err(AvrowErr::DecodeFailed)?; - Ok(byte_buf) -} - -pub fn encode_long(value: i64, writer: &mut W) -> Result { - 
writer.write_varint(value).map_err(AvrowErr::EncodeFailed) -} - -pub fn encode_raw_bytes(value: &[u8], writer: &mut W) -> Result<(), AvrowErr> { - writer - .write(value) - .map_err(AvrowErr::EncodeFailed) - .map(|_| ()) -} diff --git a/src/value.rs b/src/value.rs deleted file mode 100644 index 46fad0f..0000000 --- a/src/value.rs +++ /dev/null @@ -1,797 +0,0 @@ -//! Represents the types that - -use crate::error::AvrowErr; -use crate::schema; -use crate::schema::common::validate_name; -use crate::schema::parser::parse_default; -use crate::schema::Registry; -use crate::util::{encode_long, encode_raw_bytes}; -use crate::Schema; -use byteorder::LittleEndian; -use byteorder::WriteBytesExt; -use indexmap::IndexMap; -use integer_encoding::VarIntWriter; -use schema::Order; -use schema::Variant; -use serde::Serialize; -use std::collections::{BTreeMap, HashMap}; -use std::fmt::Display; -use std::io::Write; - -// Convenient type alias for map initialzation. -pub type Map = HashMap; - -#[derive(Debug, Clone, PartialEq, Serialize)] -pub(crate) struct FieldValue { - pub(crate) value: Value, - #[serde(skip_serializing)] - order: schema::Order, -} - -impl FieldValue { - pub(crate) fn new(value: Value) -> Self { - FieldValue { - value, - order: Order::Ascending, - } - } -} - -#[derive(Debug, Clone, PartialEq, Serialize)] -/// The [Record](https://avro.apache.org/docs/current/spec.html#schema_record) avro type. -/// Avro records translates to a struct in Rust. Any struct that implements serde's -/// Serializable trait can be converted to an avro record. -pub struct Record { - pub(crate) name: String, - pub(crate) fields: IndexMap, -} - -impl Record { - /// Creates a new avro record type with the given name. - pub fn new(name: &str) -> Self { - Record { - fields: IndexMap::new(), - name: name.to_string(), - } - } - - /// Adds a field to the record. 
- pub fn insert>(&mut self, field_name: &str, ty: T) -> Result<(), AvrowErr> { - validate_name(0, field_name)?; - self.fields - .insert(field_name.to_string(), FieldValue::new(ty.into())); - Ok(()) - } - - /// Sets the ordering of the field in the record. - pub fn set_field_order(&mut self, field_name: &str, order: Order) -> Result<(), AvrowErr> { - let a = self - .fields - .get_mut(field_name) - .ok_or(AvrowErr::FieldNotFound)?; - a.order = order; - Ok(()) - } - - /// Creates a record from a [BTreeMap](https://doc.rust-lang.org/std/collections/struct.BTreeMap.html) by consuming it. - /// The values in `BTreeMap` must implement `Into`. The `name` provided must match with the name in the record - /// schema being provided to the writer. - pub fn from_btree + Ord + Display, V: Into>( - name: &str, - btree: BTreeMap, - ) -> Result { - let mut record = Record::new(name); - for (k, v) in btree { - let field_value = FieldValue { - value: v.into(), - order: Order::Ascending, - }; - record.fields.insert(k.to_string(), field_value); - } - - Ok(record) - } - - /// Creates a record from a JSON object (serde_json::Value). A confirming record schema must be provided. - pub fn from_json( - json: serde_json::Map, - schema: &Schema, - ) -> Result { - if let Variant::Record { - name, - fields: record_schema_fields, - .. 
- } = &schema.variant - { - let mut values = IndexMap::with_capacity(record_schema_fields.len()); - 'fields: for (k, v) in record_schema_fields { - if let Some(default_value) = json.get(k) { - if let Variant::Union { variants } = &v.ty { - for var in variants { - if let Ok(v) = parse_default(&default_value, &var) { - values.insert(k.to_string(), FieldValue::new(v)); - continue 'fields; - } - } - return Err(AvrowErr::FailedDefaultUnion); - } else { - let parsed_value = parse_default(&default_value, &v.ty)?; - values.insert(k.to_string(), FieldValue::new(parsed_value)); - } - } else if let Some(v) = &v.default { - values.insert(k.to_string(), FieldValue::new(v.clone())); - } else { - return Err(AvrowErr::FieldNotFound); - } - } - - Ok(Value::Record(crate::value::Record { - fields: values, - name: name.fullname(), - })) - } else { - Err(AvrowErr::ExpectedJsonObject) - } - } -} - -// TODO: Avro sort order -// impl PartialOrd for Value { -// fn partial_cmp(&self, other: &Self) -> Option { -// match (self, other) { -// (Value::Null, Value::Null) => Some(Ordering::Equal), -// (Value::Boolean(self_v), Value::Boolean(other_v)) => { -// if self_v == other_v { -// return Some(Ordering::Equal); -// } -// if *self_v == false && *other_v { -// Some(Ordering::Less) -// } else { -// Some(Ordering::Greater) -// } -// } -// (Value::Int(self_v), Value::Int(other_v)) => Some(self_v.cmp(other_v)), -// (Value::Long(self_v), Value::Long(other_v)) => Some(self_v.cmp(other_v)), -// (Value::Float(self_v), Value::Float(other_v)) => self_v.partial_cmp(other_v), -// (Value::Double(self_v), Value::Double(other_v)) => self_v.partial_cmp(other_v), -// (Value::Bytes(self_v), Value::Bytes(other_v)) => self_v.partial_cmp(other_v), -// (Value::Byte(self_v), Value::Byte(other_v)) => self_v.partial_cmp(other_v), -// (Value::Fixed(self_v), Value::Fixed(other_v)) => self_v.partial_cmp(other_v), -// (Value::Str(self_v), Value::Str(other_v)) => self_v.partial_cmp(other_v), -// (Value::Array(self_v), 
Value::Array(other_v)) => self_v.partial_cmp(other_v), -// (Value::Enum(self_v), Value::Enum(other_v)) => self_v.partial_cmp(other_v), -// (Value::Record(_self_v), Value::Record(_other_v)) => todo!(), -// _ => todo!(), -// } -// } -// } - -/// Represents an Avro value -#[derive(Debug, Clone, PartialEq, Serialize)] -pub enum Value { - /// A null value. - Null, - /// An i32 integer value. - Int(i32), - /// An i64 long value. - Long(i64), - /// A boolean value. - Boolean(bool), - /// A f32 float value. - Float(f32), - /// A f64 float value. - Double(f64), - /// A Record value (BTreeMap). - Record(Record), - /// A Fixed value. - Fixed(Vec), - /// A Map value. - Map(Map), - /// A sequence of u8 bytes. - Bytes(Vec), - /// Rust strings map directly to avro strings - Str(String), - /// A union is a sequence of unique `Value`s - Union(Box), - /// An enumeration. Unlike Rust enums, enums in avro don't support data within their variants. - Enum(String), - /// An array of `Value`s - Array(Vec), - /// auxiliary u8 helper for serde. Not an avro value. 
- Byte(u8), -} - -impl Value { - pub(crate) fn encode( - &self, - writer: &mut W, - schema: &Variant, - cxt: &Registry, - ) -> Result<(), AvrowErr> { - match (self, schema) { - (Value::Null, Variant::Null) => {} - (Value::Boolean(b), Variant::Boolean) => writer - .write_all(&[*b as u8]) - .map_err(AvrowErr::EncodeFailed)?, - (Value::Int(i), Variant::Int) => { - writer.write_varint(*i).map_err(AvrowErr::EncodeFailed)?; - } - // int is promotable to long, float or double --- - (Value::Int(i), Variant::Long) => { - writer - .write_varint(*i as i64) - .map_err(AvrowErr::EncodeFailed)?; - } - (Value::Int(i), Variant::Float) => { - writer - .write_f32::(*i as f32) - .map_err(AvrowErr::EncodeFailed)?; - } - (Value::Int(i), Variant::Double) => { - writer - .write_f64::(*i as f64) - .map_err(AvrowErr::EncodeFailed)?; - } - // --- - (Value::Long(l), Variant::Long) => { - writer.write_varint(*l).map_err(AvrowErr::EncodeFailed)?; - } - (Value::Long(l), Variant::Float) => { - writer - .write_f32::(*l as f32) - .map_err(AvrowErr::EncodeFailed)?; - } - (Value::Long(l), Variant::Double) => { - writer - .write_f64::(*l as f64) - .map_err(AvrowErr::EncodeFailed)?; - } - (Value::Float(f), Variant::Float) => { - writer - .write_f32::(*f) - .map_err(AvrowErr::EncodeFailed)?; - } - // float is promotable to double --- - (Value::Float(f), Variant::Double) => { - writer - .write_f64::(*f as f64) - .map_err(AvrowErr::EncodeFailed)?; - } // --- - (Value::Double(d), Variant::Double) => { - writer - .write_f64::(*d) - .map_err(AvrowErr::EncodeFailed)?; - } - (ref value, Variant::Named(name)) => { - if let Some(schema) = cxt.get(name) { - value.encode(writer, schema, cxt)?; - } - } - // Match with union happens first than more specific match arms - (ref value, Variant::Union { variants, .. 
}) => { - let (union_idx, schema) = resolve_union(&value, &variants, cxt)?; - let union_idx = union_idx as i32; - writer - .write_varint(union_idx) - .map_err(AvrowErr::EncodeFailed)?; - value.encode(writer, &schema, cxt)? - } - (Value::Record(ref record), Variant::Record { fields, .. }) => { - for (f_name, f_value) in &record.fields { - let field_type = fields.get(f_name); - if let Some(field_ty) = field_type { - f_value.value.encode(writer, &field_ty.ty, cxt)?; - } - } - } - (Value::Map(hmap), Variant::Map { values }) => { - // number of keys/value (start of a block) - encode_long(hmap.keys().len() as i64, writer)?; - for (k, v) in hmap.iter() { - encode_long(k.len() as i64, writer)?; - encode_raw_bytes(&*k.as_bytes(), writer)?; - v.encode(writer, values, cxt)?; - } - // marks end of block - encode_long(0, writer)?; - } - (Value::Fixed(ref v), Variant::Fixed { .. }) => { - writer.write_all(&*v).map_err(AvrowErr::EncodeFailed)?; - } - (Value::Str(s), Variant::Str) => { - encode_long(s.len() as i64, writer)?; - encode_raw_bytes(&*s.as_bytes(), writer)?; - } - // string is promotable to bytes --- - (Value::Str(s), Variant::Bytes) => { - encode_long(s.len() as i64, writer)?; - encode_raw_bytes(&*s.as_bytes(), writer)?; - } // -- - (Value::Bytes(b), Variant::Bytes) => { - encode_long(b.len() as i64, writer)?; - encode_raw_bytes(&*b, writer)?; - } - // bytes is promotable to string --- - (Value::Bytes(b), Variant::Str) => { - encode_long(b.len() as i64, writer)?; - encode_raw_bytes(&*b, writer)?; - } // --- - (Value::Bytes(b), Variant::Fixed { size: _size, .. }) => { - encode_raw_bytes(&*b, writer)?; - } - (Value::Enum(ref sym), Variant::Enum { symbols, .. 
}) => { - if let Some(idx) = symbols.iter().position(|r| r == sym) { - writer - .write_varint(idx as i32) - .map_err(AvrowErr::EncodeFailed)?; - } else { - return Err(AvrowErr::SchemaDataMismatch); - } - } - ( - Value::Array(ref values), - Variant::Array { - items: items_schema, - }, - ) => { - let array_items_count = Value::from(values.len() as i64); - array_items_count.encode(writer, &Variant::Long, cxt)?; - - for i in values { - i.encode(writer, items_schema, cxt)?; - } - Value::from(0i64).encode(writer, &Variant::Long, cxt)?; - } - // case where serde serializes a Vec to a Array of Byte - // FIXME:figure out a better way for this? - (Value::Array(ref values), Variant::Bytes) => { - let mut v = Vec::with_capacity(values.len()); - for i in values { - if let Value::Byte(b) = i { - v.push(*b); - } - } - encode_long(values.len() as i64, writer)?; - encode_raw_bytes(&*v, writer)?; - } - _ => return Err(AvrowErr::SchemaDataMismatch), - }; - Ok(()) - } -} - -// Given a value, returns the index and the variant of the union -fn resolve_union<'a>( - value: &Value, - union_variants: &'a [Variant], - cxt: &'a Registry, -) -> Result<(usize, &'a Variant), AvrowErr> { - for (idx, variant) in union_variants.iter().enumerate() { - match (value, variant) { - (Value::Null, Variant::Null) - | (Value::Boolean(_), Variant::Boolean) - | (Value::Int(_), Variant::Int) - | (Value::Long(_), Variant::Long) - | (Value::Float(_), Variant::Float) - | (Value::Double(_), Variant::Double) - | (Value::Bytes(_), Variant::Bytes) - | (Value::Str(_), Variant::Str) - | (Value::Map(_), Variant::Map { .. }) - | (Value::Array(_), Variant::Array { .. }) - | (Value::Fixed(_), Variant::Fixed { .. }) - | (Value::Enum(_), Variant::Enum { .. }) - | (Value::Record(_), Variant::Record { .. }) => return Ok((idx, variant)), - (Value::Array(v), Variant::Fixed { size, .. 
}) => { - if v.len() == *size { - return Ok((idx, variant)); - } - return Err(AvrowErr::FixedValueLenMismatch { - found: v.len(), - expected: *size, - }); - } - (Value::Union(_), _) => return Err(AvrowErr::NoImmediateUnion), - (Value::Record(_), Variant::Named(name)) => { - if let Some(schema) = cxt.get(&name) { - return Ok((idx, schema)); - } else { - return Err(AvrowErr::SchemaNotFoundInUnion); - } - } - (Value::Enum(_), Variant::Named(name)) => { - if let Some(schema) = cxt.get(&name) { - return Ok((idx, schema)); - } else { - return Err(AvrowErr::SchemaNotFoundInUnion); - } - } - (Value::Fixed(_), Variant::Named(name)) => { - if let Some(schema) = cxt.get(&name) { - return Ok((idx, schema)); - } else { - return Err(AvrowErr::SchemaNotFoundInUnion); - } - } - _a => {} - } - } - - Err(AvrowErr::SchemaNotFoundInUnion) -} - -/////////////////////////////////////////////////////////////////////////////// -/// From impls for Value -/////////////////////////////////////////////////////////////////////////////// - -impl From<()> for Value { - fn from(_v: ()) -> Value { - Value::Null - } -} - -impl From for Value { - fn from(v: String) -> Value { - Value::Str(v) - } -} - -impl> From> for Value { - fn from(v: HashMap) -> Value { - let mut map = HashMap::with_capacity(v.len()); - for (k, v) in v.into_iter() { - map.insert(k, v.into()); - } - Value::Map(map) - } -} - -impl From for Value { - fn from(value: bool) -> Value { - Value::Boolean(value) - } -} - -impl From> for Value { - fn from(value: Vec) -> Value { - Value::Bytes(value) - } -} - -impl<'a> From<&'a [u8]> for Value { - fn from(value: &'a [u8]) -> Value { - Value::Bytes(value.to_vec()) - } -} - -impl From for Value { - fn from(value: i32) -> Value { - Value::Int(value) - } -} - -impl From for Value { - fn from(value: isize) -> Value { - Value::Int(value as i32) - } -} - -impl From for Value { - fn from(value: usize) -> Value { - Value::Int(value as i32) - } -} - -impl> From> for Value { - fn from(values: Vec) -> 
Value { - let mut new_vec = vec![]; - for i in values { - new_vec.push(i.into()); - } - Value::Array(new_vec) - } -} - -impl From for Value { - fn from(value: i64) -> Value { - Value::Long(value) - } -} - -impl From for Value { - fn from(value: u64) -> Value { - Value::Long(value as i64) - } -} - -impl From for Value { - fn from(value: f32) -> Value { - Value::Float(value) - } -} - -impl From for Value { - fn from(value: f64) -> Value { - Value::Double(value) - } -} - -impl<'a> From<&'a str> for Value { - fn from(value: &'a str) -> Value { - Value::Str(value.to_string()) - } -} - -#[macro_export] -/// Convenient macro to create a avro fixed value -macro_rules! fixed { - ($vec:tt) => { - avrow::Value::Fixed($vec) - }; -} - -/////////////////////////////////////////////////////////////////////////////// -/// Value -> Rust value -/////////////////////////////////////////////////////////////////////////////// - -impl Value { - /// Try to retrieve an avro null - pub fn as_null(&self) -> Result<(), AvrowErr> { - if let Value::Null = self { - Ok(()) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro boolean - pub fn as_boolean(&self) -> Result<&bool, AvrowErr> { - if let Value::Boolean(b) = self { - Ok(b) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro int - pub fn as_int(&self) -> Result<&i32, AvrowErr> { - if let Value::Int(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro long - pub fn as_long(&self) -> Result<&i64, AvrowErr> { - if let Value::Long(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro float - pub fn as_float(&self) -> Result<&f32, AvrowErr> { - if let Value::Float(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro double - pub fn as_double(&self) -> Result<&f64, AvrowErr> { - if let Value::Double(v) = 
self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro bytes - pub fn as_bytes(&self) -> Result<&[u8], AvrowErr> { - if let Value::Bytes(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro string - pub fn as_string(&self) -> Result<&str, AvrowErr> { - if let Value::Str(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro record - pub fn as_record(&self) -> Result<&Record, AvrowErr> { - if let Value::Record(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve the variant of the enum as a string - pub fn as_enum(&self) -> Result<&str, AvrowErr> { - if let Value::Enum(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro array - pub fn as_array(&self) -> Result<&[Value], AvrowErr> { - if let Value::Array(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro map - pub fn as_map(&self) -> Result<&HashMap, AvrowErr> { - if let Value::Map(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro union - pub fn as_union(&self) -> Result<&Value, AvrowErr> { - if let Value::Union(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } - /// Try to retrieve an avro fixed - pub fn as_fixed(&self) -> Result<&[u8], AvrowErr> { - if let Value::Fixed(v) = self { - Ok(v) - } else { - Err(AvrowErr::ExpectedVariantNotFound) - } - } -} - -#[cfg(test)] -mod tests { - use super::Record; - use crate::from_value; - use crate::Schema; - use crate::Value; - use serde::{Deserialize, Serialize}; - use std::collections::BTreeMap; - use std::str::FromStr; - - #[test] - fn record_from_btree() { - let mut rec = BTreeMap::new(); - rec.insert("foo", "bar"); - let _r = Record::from_btree("test", 
rec).unwrap(); - } - - #[derive(Debug, Serialize, Deserialize, PartialEq)] - struct SomeRecord { - one: Vec, - two: Vec, - } - - #[test] - fn named_schema_resolves() { - let schema = r##" - { - "type": "record", - "name": "SomeRecord", - "aliases": ["MyRecord"], - "fields" : [ - {"name": "one", "type":{"type": "fixed", "size": 5, "name": "md5"}}, - {"name": "two", "type":"md5"} - ] - } - "##; - - let schema = crate::Schema::from_str(schema).unwrap(); - let mut writer = crate::Writer::with_codec(&schema, vec![], crate::Codec::Null).unwrap(); - - let value = SomeRecord { - one: vec![0u8, 1, 2, 3, 4], - two: vec![0u8, 1, 2, 3, 4], - }; - - writer.serialize(&value).unwrap(); - - let output = writer.into_inner().unwrap(); - let reader = crate::Reader::new(output.as_slice()).unwrap(); - for i in reader { - let r: SomeRecord = from_value(&i).unwrap(); - assert_eq!(r, value); - } - } - - #[derive(Debug, Serialize, Deserialize)] - struct Mentees { - id: i32, - username: String, - } - - #[derive(Debug, Serialize, Deserialize)] - struct RustMentors { - name: String, - github_handle: String, - active: bool, - mentees: Mentees, - } - #[test] - fn record_from_json() { - let schema = Schema::from_str( - r##" - { - "name": "rust_mentors", - "type": "record", - "fields": [ - { - "name": "name", - "type": "string" - }, - { - "name": "github_handle", - "type": "string" - }, - { - "name": "active", - "type": "boolean" - }, - { - "name":"mentees", - "type": { - "name":"mentees", - "type": "record", - "fields": [ - {"name":"id", "type": "int"}, - {"name":"username", "type": "string"} - ] - } - } - ] - } -"##, - ) - .unwrap(); - - let json = serde_json::from_str( - r##" - { "name": "bob", - "github_handle":"ghbob", - "active": true, - "mentees":{"id":1, "username":"alice"} }"##, - ) - .unwrap(); - let rec = super::Record::from_json(json, &schema).unwrap(); - let mut writer = crate::Writer::new(&schema, vec![]).unwrap(); - writer.write(rec).unwrap(); - let avro_data = 
writer.into_inner().unwrap(); - let reader = crate::Reader::new(avro_data.as_slice()).unwrap(); - for value in reader { - let _mentors: RustMentors = from_value(&value).unwrap(); - } - } - - #[test] - fn record_has_fields_with_default() { - let schema_str = r##" - { - "namespace": "sensor.data", - "type": "record", - "name": "common", - "fields" : [ - {"name": "data", "type": ["null", "string"], "default": null} - ] - } -"##; - - let sample_data = r#"{ - "data": null - }"#; - - let serde_json = serde_json::from_str(sample_data).unwrap(); - let schema = Schema::from_str(schema_str).unwrap(); - let rec = Record::from_json(serde_json, &schema).unwrap(); - let field = &rec.as_record().unwrap().fields["data"]; - assert_eq!(field.value, Value::Null); - } -} diff --git a/src/writer.rs b/src/writer.rs deleted file mode 100644 index e3a7a11..0000000 --- a/src/writer.rs +++ /dev/null @@ -1,317 +0,0 @@ -//! The Writer is the primary interface for writing values in avro encoded format. - -use crate::codec::Codec; -use crate::config::{DEFAULT_FLUSH_INTERVAL, MAGIC_BYTES, SYNC_MARKER_SIZE}; -use crate::error::{AvrowErr, AvrowResult}; -use crate::schema::Registry; -use crate::schema::Schema; -use crate::schema::Variant; -use crate::serde_avro; -use crate::util::{encode_long, encode_raw_bytes}; -use crate::value::Map; -use crate::value::Value; -use rand::{thread_rng, Rng}; -use serde::Serialize; -use std::collections::HashMap; -use std::default::Default; -use std::io::Write; - -fn sync_marker() -> [u8; SYNC_MARKER_SIZE] { - let mut vec = [0u8; SYNC_MARKER_SIZE]; - thread_rng().fill_bytes(&mut vec[..]); - vec -} - -/// Convenient builder struct for configuring and instantiating a Writer. -pub struct WriterBuilder<'a, W> { - metadata: HashMap, - codec: Codec, - schema: Option<&'a Schema>, - datafile: Option, - flush_interval: usize, -} - -impl<'a, W: Write> WriterBuilder<'a, W> { - /// Creates a builder instance to construct a Writer. 
- pub fn new() -> Self { - WriterBuilder { - metadata: Default::default(), - codec: Codec::Null, - schema: None, - datafile: None, - flush_interval: DEFAULT_FLUSH_INTERVAL, - } - } - - /// Set any custom metadata for the datafile. - pub fn set_metadata(mut self, k: &str, v: &str) -> Self { - self.metadata - .insert(k.to_string(), Value::Bytes(v.as_bytes().to_vec())); - self - } - - /// Set one of the available codecs. This requires the respective feature flags to be enabled. - pub fn set_codec(mut self, codec: Codec) -> Self { - self.codec = codec; - self - } - - /// Provide the writer with a reference to the schema file. - pub fn set_schema(mut self, schema: &'a Schema) -> Self { - self.schema = Some(schema); - self - } - - /// Set the underlying output stream. This can be any type that implements the `Write` trait. - pub fn set_datafile(mut self, w: W) -> Self { - self.datafile = Some(w); - self - } - - /// Set the flush interval (in bytes) for the internal buffer. It's the amount of bytes post which - /// the internal buffer is written to the underlying datafile or output stream.. - /// Defaults to [`DEFAULT_FLUSH_INTERVAL`](config/constant.DEFAULT_FLUSH_INTERVAL.html). - pub fn set_flush_interval(mut self, interval: usize) -> Self { - self.flush_interval = interval; - self - } - - /// Builds the `Writer` instance consuming this builder. 
- pub fn build(self) -> AvrowResult> { - let mut writer = Writer { - out_stream: self.datafile.ok_or(AvrowErr::WriterBuildFailed)?, - schema: self.schema.ok_or(AvrowErr::WriterBuildFailed)?, - block_stream: Vec::with_capacity(self.flush_interval), - block_count: 0, - codec: self.codec, - sync_marker: sync_marker(), - flush_interval: self.flush_interval, - }; - writer.encode_custom_header(self.metadata)?; - Ok(writer) - } -} - -impl<'a, W: Write> Default for WriterBuilder<'a, W> { - fn default() -> Self { - Self::new() - } -} - -/// The Writer is the primary interface for writing values to an avro datafile or a byte container (say a `Vec`). -/// It takes a reference to the schema for validating the values being written -/// and an output stream `W` which can be any type -/// implementing the [Write](https://doc.rust-lang.org/std/io/trait.Write.html) trait. -pub struct Writer<'a, W> { - out_stream: W, - schema: &'a Schema, - block_stream: Vec, - block_count: usize, - codec: Codec, - sync_marker: [u8; 16], - flush_interval: usize, -} - -impl<'a, W: Write> Writer<'a, W> { - /// Creates a new avro `Writer` instance taking a reference to a `Schema` - /// and a type implementing [`Write`](https://doc.rust-lang.org/std/io/trait.Write.html). - pub fn new(schema: &'a Schema, out_stream: W) -> AvrowResult { - let mut writer = Writer { - out_stream, - schema, - block_stream: Vec::with_capacity(DEFAULT_FLUSH_INTERVAL), - block_count: 0, - codec: Codec::Null, - sync_marker: sync_marker(), - flush_interval: DEFAULT_FLUSH_INTERVAL, - }; - writer.encode_header()?; - Ok(writer) - } - - /// Same as the `new` method, but additionally takes a `Codec` as parameter. - /// Codecs can be used to compress the encoded data being written in an avro datafile. - /// Supported codecs as per spec are: - /// * null (default): No compression is applied. 
- /// * [snappy](https://en.wikipedia.org/wiki/Snappy_(compression)) (`--features snappy`) - /// * [deflate](https://en.wikipedia.org/wiki/DEFLATE) (`--features deflate`) - /// * [zstd](https://facebook.github.io/zstd/) compression (`--feature zstd`) - /// * [bzip](http://www.bzip.org/) compression (`--feature bzip`) - /// * [xz](https://tukaani.org/xz/) compression (`--features xz`) - pub fn with_codec(schema: &'a Schema, out_stream: W, codec: Codec) -> AvrowResult { - let mut writer = Writer { - out_stream, - schema, - block_stream: Vec::with_capacity(DEFAULT_FLUSH_INTERVAL), - block_count: 0, - codec, - sync_marker: sync_marker(), - flush_interval: DEFAULT_FLUSH_INTERVAL, - }; - writer.encode_header()?; - Ok(writer) - } - - /// Appends a value to the buffer. - /// Before a value gets written, it gets validated with the schema referenced - /// by this writer. - /// - /// # Note: - /// writes are buffered internally as per the flush interval (for performance) and the underlying - /// buffer may not reflect values immediately. - /// Call [`flush`](struct.Writer.html#method.flush) to explicitly write all buffered data. - /// Alternatively calling [`into_inner`](struct.Writer.html#method.into_inner) on the writer - /// guarantees that flush will happen and will hand over - /// the underlying buffer with all data written. - pub fn write>(&mut self, value: T) -> AvrowResult<()> { - let val: Value = value.into(); - self.schema.validate(&val)?; - - val.encode( - &mut self.block_stream, - &self.schema.variant(), - &self.schema.cxt, - )?; - self.block_count += 1; - - if self.block_stream.len() >= self.flush_interval { - self.flush()?; - } - - Ok(()) - } - - /// Appends a native Rust value to the buffer. The value must implement Serde's `Serialize` trait. 
- pub fn serialize(&mut self, value: T) -> AvrowResult<()> { - let value = serde_avro::to_value(&value)?; - self.write(value)?; - Ok(()) - } - - fn reset_block_buffer(&mut self) { - self.block_count = 0; - self.block_stream.clear(); - } - - /// Sync/flush any buffered data to the underlying buffer. - pub fn flush(&mut self) -> AvrowResult<()> { - // bail if no data is written or it has already been flushed before - if self.block_count == 0 { - return Ok(()); - } - // encode datum count - encode_long(self.block_count as i64, &mut self.out_stream)?; - // encode with codec - self.codec - .encode(&mut self.block_stream, &mut self.out_stream)?; - // Write sync marker - encode_raw_bytes(&self.sync_marker, &mut self.out_stream)?; - // Reset block buffer - self.out_stream.flush().map_err(AvrowErr::EncodeFailed)?; - self.reset_block_buffer(); - Ok(()) - } - - // Used via WriterBuilder - fn encode_custom_header(&mut self, mut map: HashMap) -> AvrowResult<()> { - self.out_stream - .write(MAGIC_BYTES) - .map_err(AvrowErr::EncodeFailed)?; - map.insert("avro.schema".to_string(), self.schema.as_bytes().into()); - let codec_str = self.codec.as_ref().as_bytes(); - map.insert("avro.codec".to_string(), codec_str.into()); - let meta_schema = &Variant::Map { - values: Box::new(Variant::Bytes), - }; - - Value::Map(map).encode(&mut self.out_stream, meta_schema, &Registry::new())?; - encode_raw_bytes(&self.sync_marker, &mut self.out_stream)?; - Ok(()) - } - - fn encode_header(&mut self) -> AvrowResult<()> { - self.out_stream - .write(MAGIC_BYTES) - .map_err(AvrowErr::EncodeFailed)?; - // encode metadata - let mut metamap = Map::with_capacity(2); - metamap.insert("avro.schema".to_string(), self.schema.as_bytes().into()); - let codec_str = self.codec.as_ref().as_bytes(); - metamap.insert("avro.codec".to_string(), codec_str.into()); - let meta_schema = &Variant::Map { - values: Box::new(Variant::Bytes), - }; - - Value::Map(metamap).encode(&mut self.out_stream, meta_schema, 
&Registry::new())?; - encode_raw_bytes(&self.sync_marker, &mut self.out_stream)?; - Ok(()) - } - - /// Consumes self and yields the inner `Write` instance. - /// Additionally calls `flush` if no flush has happened before this call. - pub fn into_inner(mut self) -> AvrowResult { - self.flush()?; - Ok(self.out_stream) - } -} - -#[cfg(test)] -mod tests { - use crate::{from_value, Codec, Reader, Schema, Writer, WriterBuilder}; - use std::io::Cursor; - use std::str::FromStr; - - #[test] - fn header_written_on_writer_creation() { - let schema = Schema::from_str(r##""null""##).unwrap(); - let v = Cursor::new(vec![]); - let writer = Writer::new(&schema, v).unwrap(); - let buf = writer.into_inner().unwrap().into_inner(); - // writer. - let slice = &buf[0..4]; - - assert_eq!(slice[0], b'O'); - assert_eq!(slice[1], b'b'); - assert_eq!(slice[2], b'j'); - assert_eq!(slice[3], 1); - } - - #[test] - fn writer_with_builder() { - let schema = Schema::from_str(r##""null""##).unwrap(); - let v = vec![]; - let mut writer = WriterBuilder::new() - .set_codec(Codec::Null) - .set_schema(&schema) - .set_datafile(v) - .set_flush_interval(128_000) - .build() - .unwrap(); - writer.serialize(()).unwrap(); - let _v = writer.into_inner().unwrap(); - - let reader = Reader::with_schema(_v.as_slice(), &schema).unwrap(); - for i in reader { - let _: () = from_value(&i).unwrap(); - } - } - - #[test] - fn custom_metadata_header() { - let schema = Schema::from_str(r##""null""##).unwrap(); - let v = vec![]; - let mut writer = WriterBuilder::new() - .set_codec(Codec::Null) - .set_schema(&schema) - .set_datafile(v) - .set_flush_interval(128_000) - .set_metadata("hello", "world") - .build() - .unwrap(); - writer.serialize(()).unwrap(); - let _v = writer.into_inner().unwrap(); - - let reader = Reader::with_schema(_v.as_slice(), &schema).unwrap(); - assert!(reader.meta().contains_key("hello")); - } -} diff --git a/tests/common.rs b/tests/common.rs deleted file mode 100644 index a90260c..0000000 --- 
a/tests/common.rs +++ /dev/null @@ -1,102 +0,0 @@ -#![allow(dead_code)] - -use avrow::Codec; -use avrow::Schema; -use avrow::{Reader, Writer}; -use std::io::Cursor; -use std::str::FromStr; - -#[derive(Debug)] -pub(crate) enum Primitive { - Null, - Boolean, - Int, - Long, - Float, - Double, - Bytes, - String, -} - -impl std::fmt::Display for Primitive { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - use Primitive::*; - let str_repr = match self { - Null => "null", - Boolean => "boolean", - Int => "int", - Long => "long", - Float => "float", - Double => "double", - Bytes => "bytes", - String => "string", - }; - write!(f, "{}", str_repr) - } -} - -pub(crate) fn writer_from_schema<'a>(schema: &'a Schema, codec: Codec) -> Writer<'a, Vec> { - let writer = Writer::with_codec(&schema, vec![], codec).unwrap(); - writer -} - -pub(crate) fn reader_with_schema<'a>(schema: &Schema, buffer: Vec) -> Reader>> { - let reader = Reader::with_schema(Cursor::new(buffer), schema).unwrap(); - reader -} - -#[allow(dead_code)] -pub(crate) fn to_file(path: &str, buffer: &[u8]) { - use std::io::Write; - let mut f = std::fs::OpenOptions::new() - .create(true) - .truncate(true) - .write(true) - .open(path) - .unwrap(); - f.write_all(&buffer).unwrap(); -} - -pub(crate) struct MockSchema; -impl MockSchema { - // creates a primitive schema - pub fn prim(self, ty: &str) -> Schema { - let schema_str = format!("{{\"type\": \"{}\"}}", ty); - Schema::from_str(&schema_str).unwrap() - } - - pub fn record(self) -> Schema { - Schema::from_str( - r#" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "#, - ) - .unwrap() - } - - pub fn record_default(self) -> Schema { - Schema::from_str( - r#" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": 
["null", "LongList"]}, - {"name": "other", "type":"long", "default": 1} - ] - } - "#, - ) - .unwrap() - } -} diff --git a/tests/read_write.rs b/tests/read_write.rs deleted file mode 100644 index 49a620d..0000000 --- a/tests/read_write.rs +++ /dev/null @@ -1,472 +0,0 @@ -extern crate pretty_env_logger; -extern crate serde_derive; - -mod common; - -use crate::common::{writer_from_schema, MockSchema}; -use avrow::{from_value, Codec, Reader, Schema, Value}; -use std::collections::HashMap; -use std::str::FromStr; - -use common::Primitive; -use serde_derive::{Deserialize, Serialize}; - -const DATUM_COUNT: usize = 10000; - -/////////////////////////////////////////////////////////////////////////////// -/// Primitive schema tests -/////////////////////////////////////////////////////////////////////////////// - -// #[cfg(feature = "codec")] -static PRIMITIVES: [Primitive; 8] = [ - Primitive::Null, - Primitive::Boolean, - Primitive::Int, - Primitive::Long, - Primitive::Float, - Primitive::Double, - Primitive::Bytes, - Primitive::String, -]; - -// static PRIMITIVES: [Primitive; 1] = [Primitive::Int]; - -#[cfg(feature = "codec")] -const CODECS: [Codec; 6] = [ - Codec::Null, - Codec::Deflate, - Codec::Snappy, - Codec::Zstd, - Codec::Bzip2, - Codec::Xz, -]; - -// #[cfg(feature = "bzip2")] -// const CODECS: [Codec; 1] = [Codec::Bzip2]; - -#[test] -#[cfg(feature = "codec")] -fn read_write_primitive() { - for codec in CODECS.iter() { - for primitive in PRIMITIVES.iter() { - // write - let name = &format!("{}", primitive); - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, *codec); - (0..DATUM_COUNT).for_each(|i| match primitive { - Primitive::Null => { - writer.write(()).unwrap(); - } - Primitive::Boolean => { - writer.write(i % 2 == 0).unwrap(); - } - Primitive::Int => { - writer.write(std::i32::MAX).unwrap(); - } - Primitive::Long => { - writer.write(std::i64::MAX).unwrap(); - } - Primitive::Float => { - writer.write(std::f32::MAX).unwrap(); - 
} - Primitive::Double => { - writer.write(std::f64::MAX).unwrap(); - } - Primitive::Bytes => { - writer.write(vec![b'a', b'v', b'r', b'o', b'w']).unwrap(); - } - Primitive::String => { - writer.write("avrow").unwrap(); - } - }); - - let buf = writer.into_inner().unwrap(); - - // read - let schema = MockSchema.prim(name); - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - match primitive { - Primitive::Null => { - let _: () = from_value(&i).unwrap(); - } - Primitive::Boolean => { - let _: bool = from_value(&i).unwrap(); - } - Primitive::Int => { - let _: i32 = from_value(&i).unwrap(); - } - Primitive::Long => { - let _: i64 = from_value(&i).unwrap(); - } - Primitive::Float => { - let _: f32 = from_value(&i).unwrap(); - } - Primitive::Double => { - let _: f64 = from_value(&i).unwrap(); - } - Primitive::Bytes => { - let _: &[u8] = from_value(&i).unwrap(); - } - Primitive::String => { - let _: &str = from_value(&i).unwrap(); - } - } - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////// -/// Complex schema tests -/////////////////////////////////////////////////////////////////////////////// - -#[derive(Debug, Serialize, Deserialize)] -struct LongList { - value: i64, - next: Option>, -} - -#[test] -#[cfg(feature = "codec")] -fn io_read_write_self_referential_record() { - // write - for codec in CODECS.iter() { - let schema = r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..1 { - let value = LongList { - value: 1i64, - next: Some(Box::new(LongList { - value: 2, - next: Some(Box::new(LongList { - value: 3, - next: None, - })), - })), - }; - // let value = LongList { - // value: 1i64, - // next: None, - // }; - 
writer.serialize(value).unwrap(); - } - - let buf = writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let _: LongList = from_value(&i).unwrap(); - } - } -} - -#[derive(Serialize, Deserialize)] -enum Suit { - SPADES, - HEARTS, - DIAMONDS, - CLUBS, -} - -#[test] -#[cfg(feature = "codec")] -fn enum_read_write() { - // write - for codec in CODECS.iter() { - let schema = r##" - { - "type": "enum", - "name": "Suit", - "symbols" : ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"] - } - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..1 { - let value = Suit::SPADES; - writer.serialize(value).unwrap(); - } - - let buf = writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let _: Suit = from_value(&i).unwrap(); - } - } -} - -#[test] -#[cfg(feature = "codec")] -fn array_read_write() { - // write - for codec in CODECS.iter() { - let schema = r##" - {"type": "array", "items": "string"} - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..DATUM_COUNT { - let value = vec!["a", "v", "r", "o", "w"]; - writer.serialize(value).unwrap(); - } - - let buf = writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let _: Vec<&str> = from_value(&i).unwrap(); - } - } -} - -#[test] -#[cfg(feature = "codec")] -fn map_read_write() { - // write - for codec in CODECS.iter() { - let schema = r##" - {"type": "map", "values": "long"} - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..DATUM_COUNT { - let mut value = HashMap::new(); - value.insert("foo", 1i64); - value.insert("bar", 2); - writer.serialize(value).unwrap(); - } - - let buf = 
writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let _: HashMap = from_value(&i).unwrap(); - } - } -} - -#[test] -#[cfg(feature = "codec")] -fn union_read_write() { - // write - for codec in CODECS.iter() { - let schema = r##" - ["null", "string"] - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..1 { - writer.serialize(()).unwrap(); - writer.serialize("hello".to_string()).unwrap(); - } - - let buf = writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let val = i.as_ref().unwrap(); - match val { - Value::Null => { - let _a: () = from_value(&i).unwrap(); - } - Value::Str(_) => { - let _a: &str = from_value(&i).unwrap(); - } - _ => unreachable!("should not happen"), - } - } - } -} - -#[test] -#[cfg(feature = "codec")] -fn fixed_read_write() { - // write - for codec in CODECS.iter() { - let schema = r##" - {"type": "fixed", "size": 16, "name": "md5"} - "##; - - let schema = Schema::from_str(schema).unwrap(); - let mut writer = writer_from_schema(&schema, *codec); - for _ in 0..1 { - let value = vec![ - b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'a', b'b', b'c', b'd', b'e', - b'f', b'g', - ]; - writer.serialize(value.as_slice()).unwrap(); - } - - let buf = writer.into_inner().unwrap(); - - // read - let reader = Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let a: [u8; 16] = from_value(&i).unwrap(); - assert_eq!(a.len(), 16); - } - } -} - -#[test] -#[cfg(feature = "codec")] -fn bytes_read_write() { - let schema = Schema::from_str(r##"{"type": "bytes"}"##).unwrap(); - let mut writer = writer_from_schema(&schema, avrow::Codec::Deflate); - let data = vec![0u8, 1u8, 2u8, 3u8, 4u8, 5u8]; - writer.serialize(&data).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader = 
Reader::with_schema(buf.as_slice(), &schema).unwrap(); - for i in reader { - let b: &[u8] = from_value(&i).unwrap(); - assert_eq!(b, &[0u8, 1u8, 2u8, 3u8, 4u8, 5u8]); - } -} - -#[test] -#[should_panic] -#[cfg(feature = "codec")] -fn write_invalid_union_data_fails() { - let schema = Schema::from_str(r##"["int", "float"]"##).unwrap(); - let mut writer = writer_from_schema(&schema, avrow::Codec::Null); - writer.serialize("string").unwrap(); -} - -#[test] -#[cfg(feature = "snappy")] -fn read_deflate_reuse() { - let schema = Schema::from_str( - r##" - { - "type": "record", - "name": "LongList", - "aliases": ["LinkedLongs"], - "fields" : [ - {"name": "value", "type": "long"}, - {"name": "next", "type": ["null", "LongList"]} - ] - } - "##, - ) - .unwrap(); - let vec = vec![]; - let mut writer = avrow::Writer::with_codec(&schema, vec, Codec::Snappy).unwrap(); - for _ in 0..100000 { - let value = LongList { - value: 1i64, - next: Some(Box::new(LongList { - value: 2i64, - next: Some(Box::new(LongList { - value: 3i64, - next: Some(Box::new(LongList { - value: 4i64, - next: Some(Box::new(LongList { - value: 5i64, - next: None, - })), - })), - })), - })), - }; - writer.serialize(value).unwrap(); - } - let vec = writer.into_inner().unwrap(); - - let reader = Reader::new(&*vec).unwrap(); - for i in reader { - let _v: LongList = from_value(&i).unwrap(); - } -} - -#[test] -fn parses_field_record_defined_within_union() { - #[derive(Serialize, Deserialize, PartialEq, Debug)] - struct Reference { - #[serde(rename = "feedReference")] - pub feed_reference: Option, - } - - #[derive(Debug, PartialEq, Clone, Deserialize, Serialize)] - pub struct FeedReference { - pub instance: String, - pub provider: String, - } - - impl Default for FeedReference { - fn default() -> FeedReference { - FeedReference { - instance: String::default(), - provider: String::default(), - } - } - } - - let schema = r##" - { - "name": "Reference", - "type": "record", - "fields": [ - { - "name": "feedReference", - 
"type": [ - "null", - { - "name": "FeedReference", - "type": "record", - "fields": [ - { - "name": "instance", - "type": "string" - }, - { - "name": "provider", - "type": "string" - } - ] - } - ], - "default": null - } - ] - } - "##; - - let reference = Reference { - feed_reference: Some(FeedReference::default()), - }; - - let schema = Schema::from_str(&schema).unwrap(); - let mut writer = avrow::Writer::new(&schema, vec![]).unwrap(); - writer.serialize(&reference).unwrap(); - let a = writer.into_inner().unwrap(); - let reader = Reader::new(a.as_slice()).unwrap(); - for i in reader { - let a: Reference = from_value(&i).unwrap(); - assert_eq!(a, reference); - } -} diff --git a/tests/schema_resolution.rs b/tests/schema_resolution.rs deleted file mode 100644 index bcff617..0000000 --- a/tests/schema_resolution.rs +++ /dev/null @@ -1,315 +0,0 @@ -/// Tests for schema resolution -mod common; - -use serde::{Deserialize, Serialize}; - -use avrow::{from_value, Codec, Reader, Schema, Value}; -use std::collections::HashMap; -use std::str::FromStr; - -use common::{reader_with_schema, writer_from_schema, MockSchema}; - -#[test] -#[should_panic] -fn null_fails_with_other_primitive_schema() { - let name = "null"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize(()).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = MockSchema.prim("boolean"); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - - for i in reader { - let _ = i.unwrap(); - } -} - -#[test] -fn writer_to_reader_promotion_primitives() { - // int -> long, float, double - for reader_schema in &["long", "float", "double"] { - let name = "int"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize(1024).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = 
MockSchema.prim(reader_schema); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - for i in reader { - assert!(i.is_ok()); - let _a = i.unwrap(); - } - } - - // long -> float, double - for reader_schema in &["float", "double"] { - let name = "long"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize(1024i64).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = MockSchema.prim(reader_schema); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - for i in reader { - assert!(i.is_ok()); - } - } - - // float -> double - for reader_schema in &["double"] { - let name = "float"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize(1026f32).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = MockSchema.prim(reader_schema); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - for i in reader { - assert!(i.is_ok()); - } - } - - // string -> bytes - for reader_schema in &["bytes"] { - let name = "string"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize("hello").unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = MockSchema.prim(reader_schema); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - for i in reader { - assert!(i.is_ok()); - let a = i.unwrap(); - assert_eq!(Value::Bytes(vec![104, 101, 108, 108, 111]), a); - } - } - - // bytes -> string - for reader_schema in &["string"] { - let name = "bytes"; - let schema = MockSchema.prim(name); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize([104u8, 101, 108, 108, 111]).unwrap(); - writer.flush().unwrap(); - - let buf = 
writer.into_inner().unwrap(); - - let reader_schema = MockSchema.prim(reader_schema); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - for i in reader { - assert!(i.is_ok()); - let a = i.unwrap(); - assert_eq!(Value::Str("hello".to_string()), a); - } - } -} - -#[derive(Serialize, Deserialize)] -enum Foo { - A, - B, - C, - E, -} - -#[test] -#[should_panic] -fn enum_fails_schema_resolution() { - let schema = - Schema::from_str(r##"{"type": "enum", "name": "Foo", "symbols": ["A", "B", "C", "D"] }"##) - .unwrap(); - let mut writer = writer_from_schema(&schema, Codec::Null); - writer.serialize(Foo::B).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - // Reading a symbol which does not exist in writer's schema should fail - let reader_schema = - Schema::from_str(r##"{"type": "enum", "name": "Foo", "symbols": ["F"] }"##).unwrap(); - let reader = Reader::with_schema(buf.as_slice(), &reader_schema).unwrap(); - - // let reader = reader_with_schema(reader_schema, name); - for i in reader { - i.unwrap(); - } -} - -#[test] -#[should_panic] -fn schema_resolution_map() { - let schema = Schema::from_str(r##"{"type": "map", "values": "string"}"##).unwrap(); - let mut writer = writer_from_schema(&schema, Codec::Null); - let mut m = HashMap::new(); - m.insert("1", "b"); - writer.serialize(m).unwrap(); - writer.flush().unwrap(); - - let buf = writer.into_inner().unwrap(); - - // // Reading a symbol which does not exist in writer's schema should fail - let reader_schema = Schema::from_str(r##"{"type": "map", "values": "int"}"##).unwrap(); - - let reader = reader_with_schema(&reader_schema, buf); - for i in reader { - let _ = i.unwrap(); - } -} - -#[derive(Serialize, Deserialize)] -struct LongList { - value: i64, - next: Option>, -} - -#[derive(Serialize, Deserialize, Debug)] -struct LongListDefault { - value: i64, - next: Option>, - other: i64, -} - -#[test] -fn record_schema_resolution_with_default_value() { - let schema 
= MockSchema.record(); - let mut writer = writer_from_schema(&schema, Codec::Null); - let list = LongList { - value: 1, - next: None, - }; - - writer.serialize(list).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let schema = MockSchema.record_default(); - let reader = reader_with_schema(&schema, buf); - for i in reader { - let rec: Result = from_value(&i); - assert!(rec.is_ok()); - } -} - -#[test] -#[cfg(feature = "codec")] -fn writer_is_a_union_but_reader_is_not() { - let writer_schema = Schema::from_str(r##"["null", "int"]"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Deflate); - writer.serialize(()).unwrap(); - writer.serialize(3).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let schema_str = r##""int""##; - let reader_schema = Schema::from_str(schema_str).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_err()); - assert!(reader.next().unwrap().is_ok()); -} - -#[test] -fn reader_is_a_union_but_writer_is_not() { - let writer_schema = Schema::from_str(r##""int""##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - writer.serialize(3).unwrap(); - - let buf = writer.into_inner().unwrap(); - - // err - let reader_schema = Schema::from_str(r##"["null", "string"]"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf.clone()); - assert!(reader.next().unwrap().is_err()); - - // ok - let reader_schema = Schema::from_str(r##"["null", "int"]"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_ok()); -} - -#[test] -fn both_are_unions_but_different() { - let writer_schema = Schema::from_str(r##"["null", "int"]"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - writer.serialize(3).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = Schema::from_str(r##"["boolean", "string"]"##).unwrap(); - let mut reader = 
reader_with_schema(&reader_schema, buf); - - // err - assert!(reader.next().unwrap().is_err()); -} - -#[test] -fn both_are_map() { - let writer_schema = Schema::from_str(r##"{"type": "map", "values": "string"}"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - let mut map = HashMap::new(); - map.insert("hello", "world"); - writer.serialize(map).unwrap(); - - let buf = writer.into_inner().unwrap(); - - // let reader_schema = - // Schema::from_str(r##"["boolean", {"type":"map", "values": "string"}]"##).unwrap(); - let reader_schema = Schema::from_str(r##"{"type": "map", "values": "string"}"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_ok()); -} - -#[test] -fn both_are_arrays() { - let writer_schema = Schema::from_str(r##"{"type": "array", "items": "int"}"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - writer.serialize(vec![1, 2, 3]).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = Schema::from_str(r##"{"type": "array", "items": "int"}"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_ok()); -} - -#[test] -fn both_are_enums() { - let writer_schema = Schema::from_str(r##"{"type": "array", "items": "int"}"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - writer.serialize(vec![1, 2, 3]).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = Schema::from_str(r##"{"type": "array", "items": "int"}"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_ok()); -} - -#[test] -fn null() { - let writer_schema = Schema::from_str(r##"{"type": "null"}"##).unwrap(); - let mut writer = writer_from_schema(&writer_schema, Codec::Null); - writer.serialize(()).unwrap(); - - let buf = writer.into_inner().unwrap(); - - let reader_schema = 
Schema::from_str(r##"{"type": "null"}"##).unwrap(); - let mut reader = reader_with_schema(&reader_schema, buf); - assert!(reader.next().unwrap().is_ok()); -}