diff --git a/.kokoro/requirements.in b/.kokoro/requirements.in index cbd7e77f..ec867d9f 100644 --- a/.kokoro/requirements.in +++ b/.kokoro/requirements.in @@ -1,10 +1,10 @@ gcp-docuploader -gcp-releasetool +gcp-releasetool>=1.10.5 # required for compatibility with cryptography>=39.x importlib-metadata typing-extensions twine wheel setuptools -nox +nox>=2022.11.21 # required to remove dependency on py charset-normalizer<3 click<8.1.0 diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 05dc4672..a2639539 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -1,6 +1,6 @@ # -# This file is autogenerated by pip-compile with python 3.10 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: # # pip-compile --allow-unsafe --generate-hashes requirements.in # @@ -113,36 +113,31 @@ commonmark==0.9.1 \ --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 # via rich -cryptography==38.0.3 \ - --hash=sha256:068147f32fa662c81aebab95c74679b401b12b57494872886eb5c1139250ec5d \ - --hash=sha256:06fc3cc7b6f6cca87bd56ec80a580c88f1da5306f505876a71c8cfa7050257dd \ - --hash=sha256:25c1d1f19729fb09d42e06b4bf9895212292cb27bb50229f5aa64d039ab29146 \ - --hash=sha256:402852a0aea73833d982cabb6d0c3bb582c15483d29fb7085ef2c42bfa7e38d7 \ - --hash=sha256:4e269dcd9b102c5a3d72be3c45d8ce20377b8076a43cbed6f660a1afe365e436 \ - --hash=sha256:5419a127426084933076132d317911e3c6eb77568a1ce23c3ac1e12d111e61e0 \ - --hash=sha256:554bec92ee7d1e9d10ded2f7e92a5d70c1f74ba9524947c0ba0c850c7b011828 \ - --hash=sha256:5e89468fbd2fcd733b5899333bc54d0d06c80e04cd23d8c6f3e0542358c6060b \ - --hash=sha256:65535bc550b70bd6271984d9863a37741352b4aad6fb1b3344a54e6950249b55 \ - --hash=sha256:6ab9516b85bebe7aa83f309bacc5f44a61eeb90d0b4ec125d2d003ce41932d36 \ - 
--hash=sha256:6addc3b6d593cd980989261dc1cce38263c76954d758c3c94de51f1e010c9a50 \ - --hash=sha256:728f2694fa743a996d7784a6194da430f197d5c58e2f4e278612b359f455e4a2 \ - --hash=sha256:785e4056b5a8b28f05a533fab69febf5004458e20dad7e2e13a3120d8ecec75a \ - --hash=sha256:78cf5eefac2b52c10398a42765bfa981ce2372cbc0457e6bf9658f41ec3c41d8 \ - --hash=sha256:7f836217000342d448e1c9a342e9163149e45d5b5eca76a30e84503a5a96cab0 \ - --hash=sha256:8d41a46251bf0634e21fac50ffd643216ccecfaf3701a063257fe0b2be1b6548 \ - --hash=sha256:984fe150f350a3c91e84de405fe49e688aa6092b3525f407a18b9646f6612320 \ - --hash=sha256:9b24bcff7853ed18a63cfb0c2b008936a9554af24af2fb146e16d8e1aed75748 \ - --hash=sha256:b1b35d9d3a65542ed2e9d90115dfd16bbc027b3f07ee3304fc83580f26e43249 \ - --hash=sha256:b1b52c9e5f8aa2b802d48bd693190341fae201ea51c7a167d69fc48b60e8a959 \ - --hash=sha256:bbf203f1a814007ce24bd4d51362991d5cb90ba0c177a9c08825f2cc304d871f \ - --hash=sha256:be243c7e2bfcf6cc4cb350c0d5cdf15ca6383bbcb2a8ef51d3c9411a9d4386f0 \ - --hash=sha256:bfbe6ee19615b07a98b1d2287d6a6073f734735b49ee45b11324d85efc4d5cbd \ - --hash=sha256:c46837ea467ed1efea562bbeb543994c2d1f6e800785bd5a2c98bc096f5cb220 \ - --hash=sha256:dfb4f4dd568de1b6af9f4cda334adf7d72cf5bc052516e1b2608b683375dd95c \ - --hash=sha256:ed7b00096790213e09eb11c97cc6e2b757f15f3d2f85833cd2d3ec3fe37c1722 - # via - # gcp-releasetool - # secretstorage +cryptography==39.0.1 \ + --hash=sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4 \ + --hash=sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f \ + --hash=sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885 \ + --hash=sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502 \ + --hash=sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41 \ + --hash=sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965 \ + --hash=sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e \ + 
--hash=sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc \ + --hash=sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad \ + --hash=sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505 \ + --hash=sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388 \ + --hash=sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6 \ + --hash=sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2 \ + --hash=sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef \ + --hash=sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac \ + --hash=sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695 \ + --hash=sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6 \ + --hash=sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336 \ + --hash=sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0 \ + --hash=sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c \ + --hash=sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106 \ + --hash=sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a \ + --hash=sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8 + # via gcp-releasetool distlib==0.3.6 \ --hash=sha256:14bad2d9b04d3a36127ac97f30b12a19268f211063d8f8ee4f47108896e11b46 \ --hash=sha256:f35c4b692542ca110de7ef0bea44d73981caeb34ca0b9b6b2e6d7790dda8f80e @@ -159,9 +154,9 @@ gcp-docuploader==0.6.4 \ --hash=sha256:01486419e24633af78fd0167db74a2763974765ee8078ca6eb6964d0ebd388af \ --hash=sha256:70861190c123d907b3b067da896265ead2eeb9263969d6955c9e0bb091b5ccbf # via -r requirements.in -gcp-releasetool==1.10.0 \ - --hash=sha256:72a38ca91b59c24f7e699e9227c90cbe4dd71b789383cb0164b088abae294c83 \ - --hash=sha256:8c7c99320208383d4bb2b808c6880eb7a81424afe7cdba3c8d84b25f4f0e097d +gcp-releasetool==1.10.5 \ + 
--hash=sha256:174b7b102d704b254f2a26a3eda2c684fd3543320ec239baf771542a2e58e109 \ + --hash=sha256:e29d29927fe2ca493105a82958c6873bb2b90d503acac56be2c229e74de0eec9 # via -r requirements.in google-api-core==2.10.2 \ --hash=sha256:10c06f7739fe57781f87523375e8e1a3a4674bf6392cd6131a3222182b971320 \ @@ -278,12 +273,6 @@ jaraco-classes==3.2.3 \ --hash=sha256:2353de3288bc6b82120752201c6b1c1a14b058267fa424ed5ce5984e3b922158 \ --hash=sha256:89559fa5c1d3c34eff6f631ad80bb21f378dbcbb35dd161fd2c6b93f5be2f98a # via keyring -jeepney==0.8.0 \ - --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \ - --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 - # via - # keyring - # secretstorage jinja2==3.1.2 \ --hash=sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852 \ --hash=sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61 @@ -340,9 +329,9 @@ more-itertools==9.0.0 \ --hash=sha256:250e83d7e81d0c87ca6bd942e6aeab8cc9daa6096d12c5308f3f92fa5e5c1f41 \ --hash=sha256:5a6257e40878ef0520b1803990e3e22303a41b5714006c32a3fd8304b26ea1ab # via jaraco-classes -nox==2022.8.7 \ - --hash=sha256:1b894940551dc5c389f9271d197ca5d655d40bdc6ccf93ed6880e4042760a34b \ - --hash=sha256:96cca88779e08282a699d672258ec01eb7c792d35bbbf538c723172bce23212c +nox==2022.11.21 \ + --hash=sha256:0e41a990e290e274cb205a976c4c97ee3c5234441a8132c8c3fd9ea3c22149eb \ + --hash=sha256:e21c31de0711d1274ca585a2c5fde36b1aa962005ba8e9322bf5eeed16dcd684 # via -r requirements.in packaging==21.3 \ --hash=sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb \ @@ -385,10 +374,6 @@ protobuf==3.20.3 \ # gcp-docuploader # gcp-releasetool # google-api-core -py==1.11.0 \ - --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ - --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 - # via nox pyasn1==0.4.8 \ 
--hash=sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d \ --hash=sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba @@ -453,10 +438,6 @@ rsa==4.9 \ --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 # via google-auth -secretstorage==3.3.3 \ - --hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \ - --hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99 - # via keyring six==1.16.0 \ --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 @@ -465,9 +446,9 @@ six==1.16.0 \ # gcp-docuploader # google-auth # python-dateutil -twine==4.0.1 \ - --hash=sha256:42026c18e394eac3e06693ee52010baa5313e4811d5a11050e7d48436cf41b9e \ - --hash=sha256:96b1cf12f7ae611a4a40b6ae8e9570215daff0611828f5fe1f37a16255ab24a0 +twine==4.0.2 \ + --hash=sha256:929bc3c280033347a00f847236564d1c52a3e61b1ac2516c97c48f3ceab756d8 \ + --hash=sha256:9e102ef5fdd5a20661eb88fad46338806c3bd32cf1db729603fe3697b1bc83c8 # via -r requirements.in typing-extensions==4.4.0 \ --hash=sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa \ @@ -497,7 +478,7 @@ zipp==3.10.0 \ # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -setuptools==65.5.1 \ - --hash=sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31 \ - --hash=sha256:e197a19aa8ec9722928f2206f8de752def0e4c9fc6953527360d1c36d94ddb2f +setuptools==67.4.0 \ + --hash=sha256:e5fd0a713141a4a105412233c63dc4e17ba0090c8e8334594ac790ec97792330 \ + --hash=sha256:f106dee1b506dee5102cc3f3e9e68137bbad6d47b616be7991714b0c62204251 # via -r requirements.in diff --git a/CHANGELOG.md b/CHANGELOG.md index 87a345b5..38b1ced0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog 
+## [0.4.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.3.0-alpha...v0.4.0-alpha) (2023-03-09) + + +### Features + +* Add config based annotation converter ([#72](https://github.com/googleapis/python-documentai-toolbox/issues/72)) ([735514e](https://github.com/googleapis/python-documentai-toolbox/commit/735514e9120698487c47a7ec1107fb6f48c26ce1)) +* Added Batch creation for Cloud Storage documents. ([#66](https://github.com/googleapis/python-documentai-toolbox/issues/66)) ([c32a371](https://github.com/googleapis/python-documentai-toolbox/commit/c32a371696047389b5baafe317d4c51449c6d7e9)) +* Added list_gcs_document_tree ([#75](https://github.com/googleapis/python-documentai-toolbox/issues/75)) ([d18d1dc](https://github.com/googleapis/python-documentai-toolbox/commit/d18d1dc9a4c6cbd36b7a918ab26a9e229230747f)) + + +### Bug Fixes + +* Handle Edge Case where GCS Shards are out of order ([#69](https://github.com/googleapis/python-documentai-toolbox/issues/69)) ([709fe86](https://github.com/googleapis/python-documentai-toolbox/commit/709fe86dc883ee3dd2c250e1da936c9e5b77b1b9)) + ## [0.3.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.2.1-alpha...v0.3.0-alpha) (2023-02-27) diff --git a/docs/documentai_toolbox/utilities.rst b/docs/documentai_toolbox/utilities.rst new file mode 100644 index 00000000..d6ecbe9f --- /dev/null +++ b/docs/documentai_toolbox/utilities.rst @@ -0,0 +1,7 @@ +Document AI Toolbox Utilities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
automodule:: google.cloud.documentai_toolbox.utilities.utilities + :members: + :private-members: + :noindex: diff --git a/docs/index.rst b/docs/index.rst index e9a9f899..222fbe7a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,7 @@ API Reference :maxdepth: 2 documentai_toolbox/wrappers + documentai_toolbox/utilities Changelog --------- diff --git a/google/cloud/documentai_toolbox/__init__.py b/google/cloud/documentai_toolbox/__init__.py index 9d1770b8..6c93a1c8 100644 --- a/google/cloud/documentai_toolbox/__init__.py +++ b/google/cloud/documentai_toolbox/__init__.py @@ -25,12 +25,11 @@ ) from .converters import ( - converters, + converter, ) -__all__ = ( - document, - page, - entity, - converters, +from .utilities import ( + utilities, ) + +__all__ = (document, page, entity, converter, utilities) diff --git a/google/cloud/documentai_toolbox/constants.py b/google/cloud/documentai_toolbox/constants.py index caa71098..5facde23 100644 --- a/google/cloud/documentai_toolbox/constants.py +++ b/google/cloud/documentai_toolbox/constants.py @@ -20,3 +20,19 @@ JSON_MIMETYPE = "application/json" FILE_CHECK_REGEX = r"(.*[.].*$)" + +# https://cloud.google.com/document-ai/quotas +BATCH_MAX_FILES = 50 +# 1GB in Bytes +BATCH_MAX_FILE_SIZE = 1073741824 +BATCH_MAX_REQUESTS = 5 + +# https://cloud.google.com/document-ai/docs/file-types +VALID_MIME_TYPES = { + "application/pdf", + "image/bmp", + "image/gif", + "image/jpeg", + "image/png", + "image/tiff", + "image/webp", +} diff --git a/google/cloud/documentai_toolbox/converters/config/__init__.py b/google/cloud/documentai_toolbox/converters/config/__init__.py new file mode 100644 index 00000000..89a37dc9 --- /dev/null +++ b/google/cloud/documentai_toolbox/converters/config/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py b/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py new file mode 100644 index 00000000..3b9bab01 --- /dev/null +++ b/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py @@ -0,0 +1,296 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Callable +from intervaltree import intervaltree + +from google.cloud import documentai +from google.cloud.documentai_v1.types import geometry + + +def _midpoint_in_bpoly( + box_a: geometry.BoundingPoly, box_b: geometry.BoundingPoly +) -> bool: + """Returns whether the midpoint in box_a is inside box_b.""" + + # Calculate the midpoint of box_a. 
+ mid_x_a = (_get_norm_x_max(box_a) + _get_norm_x_min(box_a)) / 2.0 + mid_y_a = (_get_norm_y_max(box_a) + _get_norm_y_min(box_a)) / 2.0 + + max_x_b = _get_norm_x_max(box_b) + min_x_b = _get_norm_x_min(box_b) + max_y_b = _get_norm_y_max(box_b) + min_y_b = _get_norm_y_min(box_b) + + return min_x_b < mid_x_a < max_x_b and min_y_b < mid_y_a < max_y_b + + +def _merge_text_anchors( + text_anchor_1: documentai.Document.TextAnchor, + text_anchor_2: documentai.Document.TextAnchor, +) -> documentai.Document.TextAnchor: + """Merges two TextAnchor objects into one ascending sorted TextAnchor.""" + merged_text_anchor = documentai.Document.TextAnchor() + intervals = [] + for text_segment in text_anchor_1.text_segments: + intervals.append( + intervaltree.Interval(text_segment.start_index, text_segment.end_index) + ) + for text_segment in text_anchor_2.text_segments: + intervals.append( + intervaltree.Interval(text_segment.start_index, text_segment.end_index) + ) + + interval_tree = intervaltree.IntervalTree(intervals) + interval_tree.merge_overlaps(strict=False) + ts = [] + for iv in sorted(interval_tree): + ts.append( + documentai.Document.TextAnchor.TextSegment( + start_index=iv.begin, end_index=iv.end + ) + ) + + merged_text_anchor.text_segments = ts + return merged_text_anchor + + +def _get_text_anchor_in_bbox( + bbox: documentai.BoundingPoly, + page: documentai.Document.Page, + token_in_bounding_box_function: Callable[ + [documentai.BoundingPoly, documentai.BoundingPoly], bool + ] = _midpoint_in_bpoly, +) -> documentai.Document.TextAnchor: + """Gets mergedTextAnchor of Tokens in `page` that fall inside the `bbox`.""" + + text_anchor = documentai.Document.TextAnchor() + for token in page.tokens: + if token_in_bounding_box_function(token.layout.bounding_poly, bbox): + text_anchor = _merge_text_anchors(text_anchor, token.layout.text_anchor) + return text_anchor + + +def _get_norm_x_max(bbox: geometry.BoundingPoly) -> float: + return max([vertex.x for vertex in 
bbox.normalized_vertices]) + + +def _get_norm_x_min(bbox: geometry.BoundingPoly) -> float: + return min([vertex.x for vertex in bbox.normalized_vertices]) + + +def _get_norm_y_max(bbox: geometry.BoundingPoly) -> float: + return max([vertex.y for vertex in bbox.normalized_vertices]) + + +def _get_norm_y_min(bbox: geometry.BoundingPoly) -> float: + return min([vertex.y for vertex in bbox.normalized_vertices]) + + +def _normalize_coordinates(x, y) -> float: + return round(float(x / y), 9) + + +def _convert_to_pixels(x: float, conversion_rate: float) -> float: + return x * conversion_rate + + +def _convert_bbox_units( + coordinate, input_bbox_units, width=None, height=None, multiplier=1 +) -> float: + r"""Returns a converted coordinate. + + Args: + coordinate (float): + Required.The coordinate from document.proto + input_bbox_units (str): + Required. The bounding box units. + width (float): + Optional. + height (float): + Optional. + multiplier (float): + Optional. + + Returns: + float: + A converted coordinate. + + """ + final_coordinate = coordinate + if input_bbox_units != "normalized": + if input_bbox_units == "pxl": + if width is None: + final_coordinate = _normalize_coordinates(coordinate, height) + else: + final_coordinate = _normalize_coordinates(coordinate, width) + if input_bbox_units == "inch": + x = _convert_to_pixels(coordinate, 96) + if width is None: + final_coordinate = _normalize_coordinates(x, height) + else: + final_coordinate = _normalize_coordinates(x, width) + if input_bbox_units == "cm": + x = _convert_to_pixels(coordinate, 37.795) + if width is None: + final_coordinate = _normalize_coordinates(x, height) + else: + final_coordinate = _normalize_coordinates(x, width) + + return final_coordinate * multiplier + + +def _get_multiplier( + docproto_coordinate: float, external_coordinate: float, input_bbox_units: str +) -> float: + r"""Returns a multiplier to use when converting bounding boxes. 
+ + Args: + docproto_coordinate (float): + Required.The coordinate from document.proto + external_coordinate (float): + Required.The coordinate from external annotations. + input_bbox_units (str): + Required. The bounding box units. + Returns: + float: + multiplier to use when converting bounding boxes. + + """ + if input_bbox_units == "inch": + converted = _convert_to_pixels(external_coordinate, 96) + return docproto_coordinate / converted + elif input_bbox_units == "cm": + converted = _convert_to_pixels(external_coordinate, 37.795) + return docproto_coordinate / converted + else: + return docproto_coordinate / external_coordinate + + +def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly: + r"""Returns a converted bounding box from Block. + + Args: + block (Block): + Required. + Returns: + geometry.BoundingPoly: + A geometry.BoundingPoly from bounding box. + + """ + merged_bbox = geometry.BoundingPoly() + x_multiplier = 1 + y_multiplier = 1 + coordinates = [] + nv = [] + + # _convert_bbox_units should check if external_bbox is list or not + coordinates_object = block.bounding_box + if coordinates_object == []: + return coordinates_object + + if block.page_width and block.page_height: + x_multiplier = _get_multiplier( + docproto_coordinate=block.docproto_width, + external_coordinate=block.page_width, + input_bbox_units=block.bounding_unit, + ) + y_multiplier = _get_multiplier( + docproto_coordinate=block.docproto_height, + external_coordinate=block.page_height, + input_bbox_units=block.bounding_unit, + ) + + if block.bounding_type == "1": + # Type 1 : bounding box has 4 (x,y) coordinates + + if type(block.bounding_box) == list: + for coordinate in coordinates_object: + x = _convert_bbox_units( + coordinate[f"{block.bounding_x}"], + input_bbox_units=block.bounding_unit, + width=block.docproto_width, + multiplier=x_multiplier, + ) + y = _convert_bbox_units( + coordinate[f"{block.bounding_y}"], + input_bbox_units=block.bounding_unit, + 
height=block.docproto_height, + multiplier=y_multiplier, + ) + + coordinates.append({"x": x, "y": y}) + + coordinates_object = coordinates + + elif block.bounding_type == "2": + # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner + # and (width, height) + original_x = coordinates_object[f"{block.bounding_x}"] + original_y = coordinates_object[f"{block.bounding_y}"] + + x = _convert_bbox_units( + original_x, + input_bbox_units=block.bounding_unit, + width=block.page_width, + multiplier=x_multiplier, + ) + y = _convert_bbox_units( + original_y, + input_bbox_units=block.bounding_unit, + width=block.page_height, + multiplier=y_multiplier, + ) + + # x_min_y_min + coordinates.append({"x": x, "y": y}) + # x_max_y_min + coordinates.append({"x": (x + block.bounding_width), "y": y}) + # x_max_y_max + coordinates.append( + {"x": (x + block.bounding_width), "y": (y + block.bounding_height)} + ) + # x_min_y_max + coordinates.append({"x": x, "y": (y + block.bounding_height)}) + + coordinates_object = coordinates + elif block.bounding_type == "3": + # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner + # and (width, height) + for idx in range(0, len(block.bounding_box), 2): + x = _convert_bbox_units( + block.bounding_box[idx], + input_bbox_units=block.bounding_unit, + width=block.docproto_width, + multiplier=x_multiplier, + ) + y = _convert_bbox_units( + block.bounding_box[idx + 1], + input_bbox_units=block.bounding_unit, + width=block.docproto_height, + multiplier=y_multiplier, + ) + + coordinates.append({"x": x, "y": y}) + + coordinates_object = coordinates + + for coordinates in coordinates_object: + nv.append(documentai.NormalizedVertex(x=coordinates["x"], y=coordinates["y"])) + + merged_bbox.normalized_vertices = nv + + return merged_bbox diff --git a/google/cloud/documentai_toolbox/converters/config/blocks.py b/google/cloud/documentai_toolbox/converters/config/blocks.py new file mode 100644 index 00000000..1dd89bf3 --- /dev/null +++ 
b/google/cloud/documentai_toolbox/converters/config/blocks.py @@ -0,0 +1,279 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import dataclasses +from typing import List +import json +from types import SimpleNamespace + +from google.cloud import documentai + + +@dataclasses.dataclass +class Block: + r"""Represents a Block from OCR data. + + Attributes: + bounding_box (str): + Required. + block_references: + Optional. + block_id: + Optional. + confidence: + Optional. + type_: + Required. + text: + Required. + page_number: + Optional. 
+ """ + bounding_box: dataclasses.field(init=True, repr=False, default=None) + block_references: dataclasses.field(init=False, repr=False, default=None) + block_id: dataclasses.field(init=False, repr=False, default=None) + confidence: dataclasses.field(init=False, repr=False, default=None) + type_: dataclasses.field(init=True, repr=False, default=None) + text: dataclasses.field(init=True, repr=False, default=None) + page_number: dataclasses.field(init=False, repr=False, default=None) + page_width: dataclasses.field(init=False, repr=False, default=None) + page_height: dataclasses.field(init=False, repr=False, default=None) + bounding_width: dataclasses.field(init=False, repr=False, default=None) + bounding_height: dataclasses.field(init=False, repr=False, default=None) + bounding_type: dataclasses.field(init=False, repr=False, default=None) + bounding_unit: dataclasses.field(init=False, repr=False, default=None) + bounding_x: dataclasses.field(init=False, repr=False, default=None) + bounding_y: dataclasses.field(init=False, repr=False, default=None) + docproto_width: dataclasses.field(init=False, repr=False, default=None) + docproto_height: dataclasses.field(init=False, repr=False, default=None) + + @classmethod + def create( + self, + type_, + text, + bounding_box=None, + block_references=None, + block_id=None, + confidence=None, + page_number=None, + page_width=None, + page_height=None, + bounding_width=None, + bounding_height=None, + bounding_type=None, + bounding_unit=None, + bounding_x=None, + bounding_y=None, + docproto_width=None, + docproto_height=None, + ): + return Block( + bounding_box=bounding_box, + block_references=block_references, + block_id=block_id, + confidence=confidence, + type_=type_, + text=text, + page_number=page_number, + page_width=page_width, + page_height=page_height, + bounding_width=bounding_width, + bounding_height=bounding_height, + bounding_type=bounding_type, + bounding_unit=bounding_unit, + bounding_x=bounding_x, + 
bounding_y=bounding_y, + docproto_width=docproto_width, + docproto_height=docproto_height, + ) + + +def _get_target_object(json_data: any, target_object: str) -> SimpleNamespace: + r"""Returns SimpleNamespace of target_object. + + Args: + json_data (str): + Required. data from JSON.loads . + target_object (str): + Required. The path to the target object. + + Returns: + SimpleNamespace. + + """ + json_data_s = SimpleNamespace(**json_data) + + target_object_parts = target_object.split(".") + + if not hasattr(json_data_s, target_object_parts[0]): + return None + + current_object = json_data_s + for part in target_object_parts: + if type(current_object) == dict: + current_object = SimpleNamespace(**current_object) + elif type(current_object) == list and part.isnumeric(): + current_object = current_object[int(part)] + continue + current_object = getattr(current_object, part) + return current_object + + +def _load_blocks_from_schema( + input_data: bytes, input_config: bytes, base_docproto: documentai.Document +) -> List[Block]: + r"""Loads Blocks from original annotation data and provided config. + + Args: + input_data (bytes): + Required.The bytes of the annotated data. + input_config (bytes): + Required.The bytes of config data. + base_docproto (bytes): + Required. The bytes of the original pdf. + + Returns: + List[Block]: + From original annotation data and provided config. 
+ + """ + objects = json.loads(input_data) + schema_json = json.loads(input_config, object_hook=lambda d: SimpleNamespace(**d)) + + entities = schema_json.entity_object + type_ = schema_json.entity.type_ + + mention_text = schema_json.entity.mention_text + + document_height = None + document_width = None + + id_ = schema_json.entity.id if hasattr(schema_json.entity, "id") else None + if hasattr(schema_json, "page"): + document_height = ( + schema_json.page.height if hasattr(schema_json.page, "height") else None + ) + document_width = ( + schema_json.page.width if hasattr(schema_json.page, "width") else None + ) + + confidence = ( + schema_json.entity.confidence + if hasattr(schema_json.entity, "confidence") + else None + ) + page_number = ( + schema_json.entity.page_number + if hasattr(schema_json.entity, "page_number") + else None + ) + normalized_vertices = ( + schema_json.entity.normalized_vertices.base + if hasattr(schema_json.entity.normalized_vertices, "base") + else None + ) + bounding_width = ( + schema_json.entity.normalized_vertices.width + if hasattr(schema_json.entity.normalized_vertices, "width") + else None + ) + bounding_height = ( + schema_json.entity.normalized_vertices.height + if hasattr(schema_json.entity.normalized_vertices, "height") + else None + ) + bounding_type = ( + schema_json.entity.normalized_vertices.type + if hasattr(schema_json.entity.normalized_vertices, "type") + else None + ) + bounding_unit = ( + schema_json.entity.normalized_vertices.unit + if hasattr(schema_json.entity.normalized_vertices, "unit") + else None + ) + bounding_x = ( + schema_json.entity.normalized_vertices.x + if hasattr(schema_json.entity.normalized_vertices, "x") + else None + ) + bounding_y = ( + schema_json.entity.normalized_vertices.y + if hasattr(schema_json.entity.normalized_vertices, "y") + else None + ) + + blocks = [] + ens = _get_target_object(objects, entities) + for i in ens: + entity = i + + block_text = "" + + if type_ == f"{entities}:self": + 
block_type = i + entity = _get_target_object(objects, f"{entities}.{i}") + else: + block_type = _get_target_object(entity, type_) + + if "||" in mention_text: + text_commands = mention_text.split("||") + for command in text_commands: + if command in entity: + block_text = _get_target_object(entity, command) + continue + else: + block_text = _get_target_object(entity, mention_text) + + b = Block.create( + type_=block_type, + text=block_text, + ) + + b.bounding_box = _get_target_object(entity, normalized_vertices) + + if id_: + b.id_ = _get_target_object(entity, id_) + if confidence: + b.confidence = _get_target_object(entity, confidence) + if page_number and page_number in entity: + b.page_number = _get_target_object(entity, page_number) + if bounding_width: + b.bounding_width = _get_target_object(b.bounding_box, bounding_width) + if bounding_height: + b.bounding_height = _get_target_object(b.bounding_box, bounding_height) + if document_height: + b.page_height = _get_target_object(objects, document_height) + if document_width: + b.page_width = _get_target_object(objects, document_width) + if bounding_type: + b.bounding_type = bounding_type + if bounding_unit: + b.bounding_unit = bounding_unit + if bounding_x: + b.bounding_x = bounding_x + if bounding_y: + b.bounding_y = bounding_y + + if b.page_number is None: + b.page_number = 0 + + b.docproto_width = base_docproto.pages[int(b.page_number)].dimension.width + b.docproto_height = base_docproto.pages[int(b.page_number)].dimension.height + + blocks.append(b) + return blocks diff --git a/google/cloud/documentai_toolbox/converters/config/converter_helpers.py b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py new file mode 100644 index 00000000..b6788d71 --- /dev/null +++ b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py @@ -0,0 +1,534 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not 
use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import re +import time +from concurrent import futures +from typing import List, Tuple + +from google.cloud.documentai_toolbox.converters.config.bbox_conversion import ( + _convert_bbox_to_docproto_bbox, + _get_text_anchor_in_bbox, +) +from google.cloud.documentai_toolbox.converters.config.blocks import ( + Block, + _load_blocks_from_schema, +) + +from google.cloud.documentai_toolbox import document, constants +from google.cloud import documentai, storage + + +def _get_base_ocr( + project_id: str, location: str, processor_id: str, file_bytes: bytes, mime_type: str +) -> documentai.Document: + r"""Returns documentai.Document from OCR processor. + + Args: + project_id (str): + Required. + location (str): + Required. + processor_id (str): + Required. + file_bytes (bytes): + Required. The bytes of the original pdf. + mime_type (str): + Required. usually "application/pdf". + Returns: + documentai.Document: + A documentai.Document from OCR processor. 
+ + """ + + client = documentai.DocumentProcessorServiceClient() + + name = client.processor_path(project_id, location, processor_id) + + # Load Binary Data into Document AI RawDocument Object + raw_document = documentai.RawDocument(content=file_bytes, mime_type=mime_type) + + # Configure the process request + request = documentai.ProcessRequest(name=name, raw_document=raw_document) + + result = client.process_document(request=request) + return result.document + + +def _get_entity_content( + blocks: List[Block], docproto: documentai.Document +) -> List[documentai.Document.Entity]: + r"""Returns a list of documentai.Document entities. + + Args: + blocks (List[Block]): + Required.List of blocks from original annotation. + docproto (documentai.Document): + Required.The ocr docproto. + Returns: + List[documentai.Document.Entity]: + A list of documentai.Document entities. + """ + entities = [] + entity_id = 0 + + for block in blocks: + + docai_entity = documentai.Document.Entity() + if block.confidence: + docai_entity.confidence = block.confidence + + docai_entity.type = block.type_ + docai_entity.mention_text = block.text + docai_entity.id = str(entity_id) + + entity_id += 1 + # Generates the text anchors from bounding boxes + if block.bounding_box: + # Converts external bounding box format to docproto bounding box + + b1 = _convert_bbox_to_docproto_bbox(block) + + if block.page_number: + docai_entity.text_anchor = _get_text_anchor_in_bbox( + b1, docproto.pages[int(block.page_number) - 1] + ) + else: + docai_entity.text_anchor = _get_text_anchor_in_bbox( + b1, docproto.pages[0] + ) + + docai_entity.text_anchor.content = block.text + + page_anchor = documentai.Document.PageAnchor() + page_ref = documentai.Document.PageAnchor.PageRef() + + page_ref.bounding_poly = b1 + + page_anchor.page_refs = [page_ref] + docai_entity.page_anchor = page_anchor + entities.append(docai_entity) + + return entities + + +def _convert_to_docproto_with_config( + annotated_bytes: bytes, + 
config_bytes: bytes, + document_bytes: bytes, + project_id: str, + location: str, + processor_id: str, + retry_number: int, + name: str = "", +) -> documentai.Document: + r"""Converts a single document to docproto. + + Args: + annotated_bytes (bytes): + Required. The bytes of the annotated data. + config_bytes (bytes): + Required. The bytes of config data. + document_bytes (bytes): + Required. The bytes of the original pdf. + project_id (str): + Required. + location (str): + Required. + processor_id (str): + Required. + retry_number (int): + Required. The number of seconds needed to wait if an error occurred. + name (str): + Optional. Name of the document to be converted. This is used for logging. + + Returns: + documentai.Document: + documentai.Document object. + + TODO: Depending on input type you will need to modify load_blocks. + Depending on input format, if your annotated data is not separate from the base OCR data you will need to modify _get_entity_content + Depending on input BoundingBox, if the input BoundingBox object is like https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#BoundingPoly then you will need to + modify _convert_bbox_to_docproto_bbox since the objects are different. 
+ """ + try: + base_docproto = _get_base_ocr( + project_id=project_id, + location=location, + processor_id=processor_id, + file_bytes=document_bytes, + mime_type="application/pdf", + ) + + # Loads OCR data into Blocks + # blocks = load_blocks(ocr_object=doc_object) + blocks = _load_blocks_from_schema( + input_data=annotated_bytes, + input_config=config_bytes, + base_docproto=base_docproto, + ) + + # Gets List[documentai.Document.Entity] + entities = _get_entity_content(blocks=blocks, docproto=base_docproto) + + base_docproto.entities = entities + print("Converted : %s\r" % name, end="") + return base_docproto + + except Exception as e: + print(e) + print(f"Could Not Convert {name}\nretrying") + if retry_number == 6: + return None + else: + time.sleep(retry_number) + return _convert_to_docproto_with_config( + name=name, + annotated_bytes=annotated_bytes, + config_bytes=config_bytes, + document_bytes=document_bytes, + project_id=project_id, + location=location, + processor_id=processor_id, + retry_number=retry_number + 1, + ) + + +def _get_bytes( + bucket_name: str, + prefix: str, + annotation_file_prefix: str, + config_file_prefix: str, + config_path: str = None, +) -> List[bytes]: + r"""Downloads documents and returns them as bytes. + + Args: + bucket_name (str): + Required. The bucket name. + prefix (str): + Required. The prefix for the location of the output folder. + annotation_file_prefix (str): + Required. The prefix to search for annotation file. + config_file_prefix (str): + Required. The prefix to search for config file. + config_path (str): + Optional. The gcs path to a config file. This should be used when there is a single config file. + + Returns: + List[bytes]. 
+ + """ + + storage_client = document._get_storage_client() + bucket = storage_client.bucket(bucket_name=bucket_name) + blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix) + + metadata_blob = None + + try: + for blob in blobs: + if "DS_Store" in blob.name: + continue + if not blob.name.endswith("/"): + blob_name = blob.name + file_name = blob_name.split("/")[-1] + if annotation_file_prefix in file_name: + annotation_blob = blob + elif config_file_prefix in file_name: + metadata_blob = blob + elif "pdf" in file_name: + doc_blob = blob + + if metadata_blob and config_path: + metadata_blob = bucket.get_blob(config_path) + + print("Downloaded : %s\r" % prefix.split("/")[-1], end="") + return [ + annotation_blob.download_as_bytes(), + doc_blob.download_as_bytes(), + metadata_blob.download_as_bytes(), + prefix.split("/")[-1], + file_name.split(".")[0], + ] + except Exception as e: + raise e + + +def _upload_file( + bucket_name: str, + output_prefix: str, + file: str, +) -> None: + r"""Uploads the converted docproto to gcs. + + Args: + bucket_name (str): + Required. The bucket name. + output_prefix (str): + Required. The prefix for the location of the output folder. + file (str): + Required. The docproto file in string format. + + Returns: + None. + + """ + storage_client = document._get_storage_client() + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(output_prefix) + + print("Uploaded : %s\r" % output_prefix.split("/")[-1], end="") + blob.upload_from_string(file, content_type="application/json") + + +def _get_files( + blob_list: List[storage.blob.Blob], + input_bucket: str, + input_prefix: str, + config_path: str = None, +): + r"""Returns a list of Futures of documents as bytes. + + Args: + blob_list (List[storage.blob.Blob]): + Required. The list of Futures from _get_files. + input_bucket (str): + Required. The name of the input bucket. + input_prefix (str): + Required. The prefix for the location of the input folder. 
+ config_path (str): + Optional. The gcs path to a single config file, used when all documents share the same config. + Returns: + List[futures.Future]: + A list of Futures of document bytes downloaded by _get_bytes. + + """ + download_pool = futures.ThreadPoolExecutor(10) + downloads = [] + prev = None + print("-------- Downloading Started --------") + for i, blob in enumerate(blob_list): + if "DS_Store" in blob.name: + continue + + file_path = blob.name.split("/") + file_path.pop() + doc_directory = file_path[-1] + file_path2 = "/".join(file_path) + if prev == doc_directory or f"{file_path2}/" == input_prefix: + continue + + download = download_pool.submit( + _get_bytes, + input_bucket, + file_path2, + "annotation", + "config", + config_path, + ) + downloads.append(download) + + prev = doc_directory + + return downloads + + +def _get_docproto_files( + f: List[futures.Future], + project_id: str, + location: str, + processor_id: str, +) -> Tuple[dict, list, list]: + r"""Returns converted document.proto, unique entity types and documents that were not converted. + + Args: + f (List[futures.Future]): + Required. The list of Futures from _get_files. + project_id (str): + Required. + location (str): + Required. + processor_id (str): + Required. + Returns: + Tuple[dict, list, list]: + Converted document.proto, unique entity types and documents that were not converted. 
+ + """ + did_not_convert = [] + files = {} + unique_types = [] + for future in f: + blobs = future.result() + docproto = _convert_to_docproto_with_config( + annotated_bytes=blobs[0], + document_bytes=blobs[1], + config_bytes=blobs[2], + project_id=project_id, + location=location, + processor_id=processor_id, + retry_number=1, + name=blobs[3], + ) + + if docproto is None: + did_not_convert.append(f"{blobs[3]}") + continue + + for entity in docproto.entities: + if entity.type_ not in unique_types: + unique_types.append(entity.type_) + + files[blobs[3]] = str(documentai.Document.to_json(docproto)) + + return files, unique_types, did_not_convert + + +def _upload(files: dict, gcs_output_path: str) -> None: + r"""Upload converted document.proto to gcs location. + + Args: + files (dict): + Required. The document.proto files to upload. + gcs_output_path (str): + Required. The gcs path to the folder to upload the converted docproto documents to. + + Format: `gs://{bucket}/{optional_folder}` + Returns: + None. 
+ + """ + match = re.match(r"gs://(.*?)/(.*)", gcs_output_path) + + if match is None: + raise ValueError("gcs_prefix does not match accepted format") + + output_bucket, output_prefix = match.groups() + + if output_prefix is None: + output_prefix = "/" + + file_check = re.match(constants.FILE_CHECK_REGEX, output_prefix) + + if file_check: + raise ValueError("gcs_prefix cannot contain file types") + + download_pool = futures.ThreadPoolExecutor(10) + uploads = [] + print("-------- Uploading Started --------") + for i, key in enumerate(files): + op = output_prefix.split("/") + op.pop() + if "config" not in key and "annotations" not in key: + upload = download_pool.submit( + _upload_file, + output_bucket, + f"{output_prefix}/{key}.json", + files[key], + ) + uploads.append(upload) + + futures.wait(uploads) + + +def _convert_documents_with_config( + gcs_input_path: str, + gcs_output_path: str, + project_id: str, + location: str, + processor_id: str, + config_path: str = None, +) -> None: + r"""Converts all documents in gcs_path to docproto. + + Args: + gcs_input_path (str): + Required. The gcs path to the folder containing all non docproto documents. + + Format: `gs://{bucket}/{optional_folder}` + gcs_output_path (str): + Required. The gcs path to the folder to upload the converted docproto documents to. + + Format: `gs://{bucket}/{optional_folder}` + project_id (str): + Required. + location (str): + Required. + processor_id (str): + Required. + config_path: + Optional. The gcs path to a single config file. This will work if all the documents in gcs_input_path are of the same config type. + + Format: `gs://{bucket}/{optional_folder}/config.json` + + Returns: + None. 
+ + """ + match = re.match(r"gs://(.*?)/(.*)", gcs_input_path) + + if match is None: + raise ValueError("gcs_prefix does not match accepted format") + + input_bucket, input_prefix = match.groups() + + if input_prefix is None: + input_prefix = "/" + + file_check = re.match(constants.FILE_CHECK_REGEX, input_prefix) + + if file_check: + raise ValueError("gcs_prefix cannot contain file types") + + storage_client = document._get_storage_client() + + blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix) + + downloads = _get_files( + blob_list=blob_list, + input_prefix=input_prefix, + input_bucket=input_bucket, + config_path=config_path, + ) + + f, _ = futures.wait(downloads) + + print("-------- Finished Downloading --------") + + print("-------- Converting Started --------") + + files = [] + did_not_convert = [] + labels = [] + + files, labels, did_not_convert = _get_docproto_files( + f, project_id, location, processor_id + ) + + print("-------- Finished Converting --------") + if did_not_convert != []: + print(f"Did not convert {len(did_not_convert)} documents") + print(did_not_convert) + + _upload(files, gcs_output_path) + + print("-------- Finished Uploading --------") + print("-------- Schema Information --------") + print(f"Unique Entity Types: {labels}") + + +# [min,min],[max,min],[max,max],[min,max] diff --git a/google/cloud/documentai_toolbox/converters/converter.py b/google/cloud/documentai_toolbox/converters/converter.py new file mode 100644 index 00000000..c80dd9cd --- /dev/null +++ b/google/cloud/documentai_toolbox/converters/converter.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Document.proto converters.""" + +from google.cloud.documentai_toolbox.converters.config.converter_helpers import ( + _convert_documents_with_config, +) + + +def convert_from_config( + project_id: str, + location: str, + processor_id: str, + gcs_input_path: str, + gcs_output_path: str, + config_path: str = None, +) -> None: + r"""Converts all documents in gcs_input_path to docproto using configs. + + Args: + project_id (str): + Required. + location (str): + Required. + processor_id (str): + Required. + gcs_input_path (str): + Required. The gcs path to the folder containing all non docproto documents. + + Format: `gs://{bucket}/{optional_folder}` + gcs_output_path (str): + Required. The gcs path to the folder to upload the converted docproto documents to. + + Format: `gs://{bucket}/{optional_folder}` + config_path: + Optional. The gcs path to a single config file. This will work if all the documents in gcs_input_path are of the same config type. + + Format: `gs://{bucket}/{optional_folder}/config.json` + Returns: + None. 
+ + """ + _convert_documents_with_config( + project_id=project_id, + location=location, + processor_id=processor_id, + gcs_input_path=gcs_input_path, + gcs_output_path=gcs_output_path, + config_path=config_path, + ) diff --git a/google/cloud/documentai_toolbox/converters/converters.py b/google/cloud/documentai_toolbox/converters/converters.py deleted file mode 100644 index a00b21af..00000000 --- a/google/cloud/documentai_toolbox/converters/converters.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Document.proto converters.""" - -from typing import List -from google.cloud.vision import AnnotateFileResponse, ImageAnnotationContext -from google.cloud.vision import AnnotateImageResponse - -from google.cloud.documentai_toolbox.wrappers import page - -from google.cloud.documentai_toolbox.converters.vision_helpers import ( - _convert_document_page, - _get_text_anchor_substring, - PageInfo, -) - - -def _convert_to_vision_annotate_file_response(text: str, pages: List[page.Page]): - """Convert OCR data from Document proto to AnnotateFileResponse proto (Vision API). - - Args: - text (str): - Contents of document. - List[Page]: - A list of Pages. - - Returns: - AnnotateFileResponse proto with a TextAnnotation per page. 
- """ - responses = [] - vision_file_response = AnnotateFileResponse() - page_idx = 0 - while page_idx < len(pages): - page_info = PageInfo(pages[page_idx].documentai_page, text) - page_vision_annotation = _convert_document_page(page_info) - page_vision_annotation.text = _get_text_anchor_substring( - text, pages[page_idx].documentai_page.layout.text_anchor - ) - responses.append( - AnnotateImageResponse( - full_text_annotation=page_vision_annotation, - context=ImageAnnotationContext(page_number=page_idx + 1), - ) - ) - page_idx += 1 - - vision_file_response.responses = responses - - return vision_file_response diff --git a/google/cloud/documentai_toolbox/utilities/__init__.py b/google/cloud/documentai_toolbox/utilities/__init__.py new file mode 100644 index 00000000..89a37dc9 --- /dev/null +++ b/google/cloud/documentai_toolbox/utilities/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/google/cloud/documentai_toolbox/utilities/utilities.py b/google/cloud/documentai_toolbox/utilities/utilities.py new file mode 100644 index 00000000..bcaacc88 --- /dev/null +++ b/google/cloud/documentai_toolbox/utilities/utilities.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Document AI utilities.""" +import os +import re +from typing import Dict, List, Optional + +from google.cloud import documentai + +from google.cloud.documentai_toolbox import constants +from google.cloud.documentai_toolbox.wrappers.document import _get_storage_client + + +def list_gcs_document_tree( + gcs_bucket_name: str, gcs_prefix: str +) -> Dict[str, List[str]]: + r"""Returns a list path to files in Cloud Storage folder and prints the tree to terminal. + + Args: + gcs_bucket_name (str): + Required. The name of the gcs bucket. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. + gcs_prefix (str): + Required. The prefix of the json files in the target_folder. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. + Returns: + Dict[str, List[str]]: + The paths to documents in `gs://{gcs_bucket_name}/{gcs_prefix}`. 
+ + """ + file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix) + + if file_check is not None: + raise ValueError("gcs_prefix cannot contain file types") + + storage_client = _get_storage_client() + blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix) + + path_list: Dict[str, List[str]] = {} + + for blob in blob_list: + directory, file_name = os.path.split(blob.name) + + if directory in path_list: + path_list[directory].append(file_name) + else: + path_list[directory] = [file_name] + + return path_list + + +def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None: + r"""Prints a tree of filenames in Cloud Storage folder.. + + Args: + gcs_bucket_name (str): + Required. The name of the gcs bucket. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. + gcs_prefix (str): + Required. The prefix of the json files in the target_folder. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. + Returns: + None. + + """ + FILENAME_TREE_MIDDLE = "├──" + FILENAME_TREE_LAST = "└──" + FILES_TO_DISPLAY = 4 + + path_list = list_gcs_document_tree( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + + for directory, files in path_list.items(): + print(f"{directory}") + dir_size = len(files) + for idx, file_name in enumerate(files): + if idx == dir_size - 1: + if dir_size > FILES_TO_DISPLAY: + print("│ ....") + print(f"{FILENAME_TREE_LAST}{file_name}\n") + elif idx <= FILES_TO_DISPLAY: + print(f"{FILENAME_TREE_MIDDLE}{file_name}") + + +def create_batches( + gcs_bucket_name: str, + gcs_prefix: str, + batch_size: Optional[int] = constants.BATCH_MAX_FILES, +) -> List[documentai.BatchDocumentsInputConfig]: + """Create batches of documents in Cloud Storage to process with `batch_process_documents()`. + + Args: + gcs_bucket_name (str): + Required. The name of the gcs bucket. 
+ + Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`. + gcs_prefix (str): + Required. The prefix of the json files in the `target_folder` + + Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. + batch_size (Optional[int]): + Optional. Size of each batch of documents. Default is `50`. + + Returns: + List[documentai.BatchDocumentsInputConfig]: + A list of `BatchDocumentsInputConfig`, each corresponding to one batch. + """ + if batch_size > constants.BATCH_MAX_FILES: + raise ValueError( + f"Batch size must be less than {constants.BATCH_MAX_FILES}. You provided {batch_size}." + ) + + storage_client = _get_storage_client() + blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix) + batches: List[documentai.BatchDocumentsInputConfig] = [] + batch: List[documentai.GcsDocument] = [] + + for blob in blob_list: + # Skip Directories + if blob.name.endswith("/"): + continue + + if blob.content_type not in constants.VALID_MIME_TYPES: + print(f"Skipping file {blob.name}. Invalid Mime Type {blob.content_type}.") + continue + + if blob.size > constants.BATCH_MAX_FILE_SIZE: + print( + f"Skipping file {blob.name}. File size must be less than {constants.BATCH_MAX_FILE_SIZE} bytes. File size is {blob.size} bytes." 
+ ) + continue + + if len(batch) == batch_size: + batches.append( + documentai.BatchDocumentsInputConfig( + gcs_documents=documentai.GcsDocuments(documents=batch) + ) + ) + batch = [] + + batch.append( + documentai.GcsDocument( + gcs_uri=f"gs://{gcs_bucket_name}/{blob.name}", + mime_type=blob.content_type, + ) + ) + + if batch != []: + # Append the last batch, which could be less than `batch_size` + batches.append( + documentai.BatchDocumentsInputConfig( + gcs_documents=documentai.GcsDocuments(documents=batch) + ) + ) + + return batches diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index 40d78536..7f088250 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.3.0-alpha" +__version__ = "0.4.0-alpha" diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 5a9affbf..82a0c6c3 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -30,12 +30,17 @@ from google.cloud.documentai_toolbox.wrappers.page import Page from google.cloud.documentai_toolbox.wrappers.page import FormField from google.cloud.documentai_toolbox.wrappers.entity import Entity -from google.cloud.documentai_toolbox.converters.converters import ( - _convert_to_vision_annotate_file_response, -) -from google.cloud.vision import AnnotateFileResponse +from google.cloud.vision import AnnotateFileResponse, ImageAnnotationContext +from google.cloud.vision import AnnotateImageResponse + +from google.cloud.documentai_toolbox.wrappers import page +from google.cloud.documentai_toolbox.converters.vision_helpers import ( + _convert_document_page, + _get_text_anchor_substring, + PageInfo, +) from pikepdf import Pdf @@ -76,8 +81,8 @@ def 
_pages_from_shards(shards: List[documentai.Document]) -> List[Page]: result = [] for shard in shards: text = shard.text - for page in shard.pages: - result.append(Page(documentai_page=page, text=text)) + for shard_page in shard.pages: + result.append(Page(documentai_page=shard_page, text=text)) return result @@ -106,11 +111,11 @@ def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]: gcs_bucket_name (str): Required. The name of the gcs bucket. - Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. gcs_prefix (str): Required. The prefix of the json files in the target_folder - Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. Returns: List[bytes]: A list of bytes. @@ -138,11 +143,11 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume gcs_bucket_name (str): Required. The name of the gcs bucket. - Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. gcs_prefix (str): Required. The prefix of the json files in the target_folder. - Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. Returns: List[google.cloud.documentai.Document]: A list of documentai.Documents. 
@@ -160,10 +165,21 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume for byte in byte_array: shards.append(documentai.Document.from_json(byte, ignore_unknown_fields=True)) + if len(shards) > 1: + shards.sort(key=lambda x: int(x.shard_info.shard_index)) return shards def _text_from_shards(shards: List[documentai.Document]) -> str: + r"""Gets text from shards. + + Args: + shards (List[google.cloud.documentai.Document]): + Required. List of document shards. + Returns: + str: + Text in all shards. + """ total_text = "" for shard in shards: if total_text == "": @@ -174,54 +190,38 @@ def _text_from_shards(shards: List[documentai.Document]) -> str: return total_text -def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None: - r"""Prints a tree of filenames in Cloud Storage folder. +def _convert_to_vision_annotate_file_response(text: str, pages: List[page.Page]): + r"""Convert OCR data from Document.proto to AnnotateFileResponse.proto for Vision API. Args: - gcs_bucket_name (str): - Required. The name of the gcs bucket. - - Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`. - gcs_prefix (str): - Required. The prefix of the json files in the target_folder. - - Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. + text (str): + Required. Contents of document. + pages (List[Page]): + Required. A list of pages. Returns: - None. - + AnnotateFileResponse: + Proto with TextAnnotations. 
""" - FILENAME_TREE_MIDDLE = "├──" - FILENAME_TREE_LAST = "└──" - FILES_TO_DISPLAY = 4 - - file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix) - - if file_check is not None: - raise ValueError("gcs_prefix cannot contain file types") - - storage_client = _get_storage_client() - blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix) - - path_list: Dict[str, List[str]] = {} - - for blob in blob_list: - directory, file_name = os.path.split(blob.name) + responses = [] + vision_file_response = AnnotateFileResponse() + page_idx = 0 + while page_idx < len(pages): + page_info = PageInfo(pages[page_idx].documentai_page, text) + page_vision_annotation = _convert_document_page(page_info) + page_vision_annotation.text = _get_text_anchor_substring( + text, pages[page_idx].documentai_page.layout.text_anchor + ) + responses.append( + AnnotateImageResponse( + full_text_annotation=page_vision_annotation, + context=ImageAnnotationContext(page_number=page_idx + 1), + ) + ) + page_idx += 1 - if directory in path_list: - path_list[directory].append(file_name) - else: - path_list[directory] = [file_name] + vision_file_response.responses = responses - for directory, files in path_list.items(): - print(f"{directory}") - dir_size = len(files) - for idx, file_name in enumerate(files): - if idx == dir_size - 1: - if dir_size > FILES_TO_DISPLAY: - print("│ ....") - print(f"{FILENAME_TREE_LAST}{file_name}\n") - elif idx <= FILES_TO_DISPLAY: - print(f"{FILENAME_TREE_MIDDLE}{file_name}") + return vision_file_response @dataclasses.dataclass @@ -240,11 +240,11 @@ class Document: gcs_bucket_name (Optional[str]): Optional. The name of the gcs bucket. - Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`. gcs_prefix (Optional[str]): Optional. The prefix of the json files in the target_folder. 
- Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`. + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`. For more information please take a look at https://cloud.google.com/storage/docs/json_api/v1/objects/list . pages: (List[Page]): @@ -315,7 +315,7 @@ def from_gcs(cls, gcs_bucket_name: str, gcs_prefix: str): gcs_prefix (str): Required. The prefix to the location of the target folder. - Format: Given `gs://{bucket_name}/optional_folder/target_folder` where gcs_prefix=`{optional_folder}/{target_folder}`. + Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where gcs_prefix=`{optional_folder}/{target_folder}`. Returns: Document: A document from gcs. @@ -347,12 +347,12 @@ def search_pages( ) found_pages = [] - for page in self.pages: - for paragraph in page.paragraphs: + for p in self.pages: + for paragraph in p.paragraphs: if (target_string and target_string in paragraph.text) or ( pattern and re.search(pattern, paragraph.text) ): - found_pages.append(page) + found_pages.append(p) break return found_pages @@ -369,8 +369,8 @@ def get_form_field_by_name(self, target_field: str) -> List[FormField]: """ found_fields = [] - for page in self.pages: - for form_field in page.form_fields: + for p in self.pages: + for form_field in p.form_fields: if target_field.lower() in form_field.field_name.lower(): found_fields.append(form_field) @@ -495,11 +495,12 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]: return output_files def convert_document_to_annotate_file_response(self) -> AnnotateFileResponse: - """Convert OCR data from Document proto to AnnotateFileResponse proto (Vision API). + r"""Convert OCR data from Document.proto to AnnotateFileResponse.proto for Vision API. Args: None. Returns: - AnnotateFileResponse proto with a TextAnnotation per page. + AnnotateFileResponse: + Proto with TextAnnotations. 
""" return _convert_to_vision_annotate_file_response(self.text, self.pages) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index b6162c65..f740873b 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -326,11 +326,14 @@ class Page: Required. The original google.cloud.documentai.Document.Page object. text: (str): Required. The full text of the Document containing the Page. - lines (List[str]): + form_fields (List[FormField]): + Required. A list of visually detected form fields on the + page. + lines (List[Line]): Required. A list of visually detected text lines on the page. A collection of tokens that a human would perceive as a line. - paragraphs (List[str]): + paragraphs (List[Paragraph]): Required. A list of visually detected text paragraphs on the page. A collection of lines that a human would perceive as a paragraph. diff --git a/samples/sample-converter-configs/AWS/AWS-config.json b/samples/sample-converter-configs/AWS/AWS-config.json new file mode 100644 index 00000000..d33c1d50 --- /dev/null +++ b/samples/sample-converter-configs/AWS/AWS-config.json @@ -0,0 +1,14 @@ +{ + "entity_object":"Blocks", + "entity": { + "type_":"BlockType", + "mention_text":"Text", + "normalized_vertices":{ + "type":"1", + "unit":"normalized", + "base":"Geometry.Polygon", + "x":"X", + "y":"Y" + } + } +} \ No newline at end of file diff --git a/samples/sample-converter-configs/Azure/form-config.json b/samples/sample-converter-configs/Azure/form-config.json new file mode 100644 index 00000000..19749112 --- /dev/null +++ b/samples/sample-converter-configs/Azure/form-config.json @@ -0,0 +1,18 @@ +{ + "entity_object":"analyzeResult.pageResults.0.keyValuePairs", + "page": { + "height":"analyzeResult.readResults.0.height", + "width":"analyzeResult.readResults.0.width" + }, + "entity": { + "type_":"key.text", + "mention_text":"value.text", + 
"normalized_vertices":{ + "type":"3", + "unit":"inch", + "base":"key.boundingBox", + "x":"x", + "y":"y" + } + } +} \ No newline at end of file diff --git a/samples/sample-converter-configs/Azure/invoice-config.json b/samples/sample-converter-configs/Azure/invoice-config.json new file mode 100644 index 00000000..3ec3468e --- /dev/null +++ b/samples/sample-converter-configs/Azure/invoice-config.json @@ -0,0 +1,18 @@ +{ + "entity_object":"analyzeResult.documentResults.0.fields", + "page": { + "height":"analyzeResult.readResults.0.height", + "width":"analyzeResult.readResults.0.width" + }, + "entity": { + "type_":"analyzeResult.documentResults.0.fields:self", + "mention_text":"text", + "normalized_vertices":{ + "type":"3", + "unit":"pxl", + "base":"boundingBox", + "x":"x", + "y":"y" + } + } +} \ No newline at end of file diff --git a/samples/snippets/convert_external_annotations_sample.py b/samples/snippets/convert_external_annotations_sample.py new file mode 100644 index 00000000..02fa0d15 --- /dev/null +++ b/samples/snippets/convert_external_annotations_sample.py @@ -0,0 +1,71 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# [START documentai_toolbox_convert_external_annotations] + +from google.cloud.documentai_toolbox import converter + +# TODO(developer): Uncomment these variables before running the sample. +# This sample will convert external annotations to the Document.json format used by Document AI Workbench for training. 
+# To process this the external annotation must have these type of objects: +# 1) Type +# 2) Text +# 3) Bounding Box (bounding boxes must be 1 of the 3 optional types) +# +# This is the bare minimum requirement to convert the annotations but for better accuracy you will need to also have: +# 1) Document width & height +# +# Bounding Box Types: +# Type 1: +# bounding_box:[{"x":1,"y":2},{"x":2,"y":2},{"x":2,"y":3},{"x":1,"y":3}] +# Type 2: +# bounding_box:{ "Width": 1, "Height": 1, "Left": 1, "Top": 1} +# Type 3: +# bounding_box: [1,2,2,2,2,3,1,3] +# +# Note: If these types are not sufficient you can propose a feature request or contribute the new type and conversion functionality. +# +# Given a folders in gcs_input_path with the following structure : +# +# gs://path/to/input/folder +# ├──test_annotations.json +# ├──test_config.json +# └──test.pdf +# +# An example of the config is in sample-converter-configs/Azure/form-config.json +# +# location = "us", +# processor_id = "my_processor_id" +# gcs_input_path = "gs://path/to/input/folder" +# gcs_output_path = "gs://path/to/input/folder" + + +def convert_external_annotations_sample( + location: str, + processor_id: str, + project_id: str, + gcs_input_path: str, + gcs_output_path: str, +) -> None: + converter.convert_from_config( + project_id=project_id, + location=location, + processor_id=processor_id, + gcs_input_path=gcs_input_path, + gcs_output_path=gcs_output_path, + ) + + +# [END documentai_toolbox_convert_external_annotations] diff --git a/samples/snippets/create_batches_sample.py b/samples/snippets/create_batches_sample.py new file mode 100644 index 00000000..0847c170 --- /dev/null +++ b/samples/snippets/create_batches_sample.py @@ -0,0 +1,52 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# [START documentai_toolbox_create_batches] + +from google.cloud import documentai +from google.cloud.documentai_toolbox import utilities + +# TODO(developer): Uncomment these variables before running the sample. +# Given unprocessed documents in path gs://bucket/path/to/folder +# gcs_bucket_name = "bucket" +# gcs_prefix = "path/to/folder" +# batch_size = 50 + + +def create_batches_sample( + gcs_bucket_name: str, + gcs_prefix: str, + batch_size: int = 50, +) -> None: + # Creating batches of documents for processing + batches = utilities.create_batches( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix, batch_size=batch_size + ) + + print(f"{len(batches)} batch(es) created.") + for batch in batches: + print(f"{len(batch.gcs_documents.documents)} files in batch.") + print(batch.gcs_documents.documents) + + # Use as input for batch_process_documents() + # Refer to https://cloud.google.com/document-ai/docs/send-request + # for how to send a batch processing request + request = documentai.BatchProcessRequest( + name="processor_name", input_documents=batch + ) + + +# [END documentai_toolbox_create_batches] diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 22c65bbd..1765c2a8 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -206,9 +206,7 @@ def _session_tests( if os.path.exists("requirements-test.txt"): if os.path.exists("constraints-test.txt"): - session.install( - "-r", "requirements-test.txt", "-c", "constraints-test.txt" - ) + session.install("-r", "requirements-test.txt", "-c", 
"constraints-test.txt") else: session.install("-r", "requirements-test.txt") with open("requirements-test.txt") as rtfile: @@ -221,9 +219,9 @@ def _session_tests( post_install(session) if "pytest-parallel" in packages: - concurrent_args.extend(['--workers', 'auto', '--tests-per-worker', 'auto']) + concurrent_args.extend(["--workers", "auto", "--tests-per-worker", "auto"]) elif "pytest-xdist" in packages: - concurrent_args.extend(['-n', 'auto']) + concurrent_args.extend(["-n", "auto"]) session.run( "pytest", @@ -282,4 +280,4 @@ def readmegen(session: nox.sessions.Session, path: str) -> None: in_file = os.path.join(dir_, "README.rst.in") session.run( "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file - ) \ No newline at end of file + ) diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index e7c25fdb..c3e41670 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -17,6 +17,7 @@ # [START documentai_toolbox_quickstart] from google.cloud.documentai_toolbox import document +from google.cloud.documentai_toolbox import utilities # TODO(developer): Uncomment these variables before running the sample. 
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder @@ -25,6 +26,11 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None: + print("Document structure in Cloud Storage") + utilities.print_gcs_document_tree( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + wrapped_document = document.Document.from_gcs( gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix ) diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 6732e7a7..91d51543 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ -pytest==7.2.1 +pytest==7.2.2 mock==5.0.1 google-cloud-bigquery==3.6.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index bf98fe35..e955a0c3 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ google-cloud-bigquery==3.6.0 -google-cloud-documentai==2.13.0 +google-cloud-documentai==2.14.0 google-cloud-storage==2.7.0 google-cloud-documentai-toolbox==0.1.1a0 diff --git a/samples/snippets/test_convert_document_to_vision_sample.py b/samples/snippets/test_convert_document_to_vision_sample.py index 0f782fc4..668e1acd 100644 --- a/samples/snippets/test_convert_document_to_vision_sample.py +++ b/samples/snippets/test_convert_document_to_vision_sample.py @@ -24,7 +24,7 @@ gcs_input_uri = "output/123456789/0" -def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None: +def test_convert_document_to_vision_sample(capsys: pytest.CaptureFixture) -> None: convert_document_to_vision_sample.convert_document_to_vision_sample( gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri ) diff --git a/samples/snippets/test_convert_external_annotations_sample.py b/samples/snippets/test_convert_external_annotations_sample.py new file mode 100644 index 00000000..63acecdb --- /dev/null +++ b/samples/snippets/test_convert_external_annotations_sample.py 
@@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os + +import pytest +from samples.snippets import convert_external_annotations_sample + +location = "us" +project_id = os.environ["GOOGLE_CLOUD_PROJECT"] + + +def test_convert_external_annotations_sample(capsys: pytest.CaptureFixture) -> None: + convert_external_annotations_sample.convert_external_annotations_sample( + location=location, + processor_id="52a38e080c1a7296", + project_id="project_id", + gcs_input_path="gs://documentai_toolbox_samples/documentai_toolbox_samples/converter/azure", + gcs_output_path="gs://documentai_toolbox_samples/documentai_toolbox_samples/converter/output", + ) + out, _ = capsys.readouterr() + + assert "-------- Finished Converting --------" in out diff --git a/samples/snippets/test_create_batches_sample.py b/samples/snippets/test_create_batches_sample.py new file mode 100644 index 00000000..b13c6279 --- /dev/null +++ b/samples/snippets/test_create_batches_sample.py @@ -0,0 +1,33 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import pytest +from samples.snippets import create_batches_sample + +gcs_bucket_name = "cloud-samples-data" +gcs_input_uri = "documentai_toolbox/document_batches/" +batch_size = 50 + + +def test_create_batches_sample(capsys: pytest.CaptureFixture) -> None: + create_batches_sample.create_batches_sample( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri, batch_size=batch_size + ) + out, _ = capsys.readouterr() + + assert "2 batch(es) created." in out + assert "50 files in batch." in out + assert "47 files in batch." in out diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py index d48ba91c..912a27d8 100644 --- a/samples/snippets/test_quickstart_sample.py +++ b/samples/snippets/test_quickstart_sample.py @@ -30,5 +30,6 @@ def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None: ) out, _ = capsys.readouterr() + assert "Document structure in Cloud Storage" in out assert "Number of Pages: 1" in out assert "Number of Entities: 35" in out diff --git a/setup.py b/setup.py index 9791bdf9..b4ca2f9b 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ "google-cloud-storage >= 1.31.0, < 3.0.0dev", "google-cloud-vision >= 2.7.0, < 4.0.0dev ", "numpy >= 1.18.1", + "intervaltree >= 3.0.0", "pikepdf >= 6.2.9, < 8.0.0", "immutabledict >= 2.0.0, < 3.0.0dev", ), diff --git a/tests/unit/resources/converters/test_config_type_1.json b/tests/unit/resources/converters/test_config_type_1.json new file mode 100644 index 00000000..554bdaf9 --- /dev/null +++ 
b/tests/unit/resources/converters/test_config_type_1.json @@ -0,0 +1,15 @@ +{ + "entity_object":"pages.1.Entities", + "entity": { + "mention_text":"Text", + "type_":"Type", + "page_number": "page", + "normalized_vertices":{ + "type":"1", + "unit":"inch", + "base":"bBox", + "x":"x", + "y":"y" + } + } +} \ No newline at end of file diff --git a/tests/unit/resources/converters/test_config_type_2.json b/tests/unit/resources/converters/test_config_type_2.json new file mode 100644 index 00000000..5403f0f5 --- /dev/null +++ b/tests/unit/resources/converters/test_config_type_2.json @@ -0,0 +1,16 @@ +{ + "entity_object":"document.entities", + "entity": { + "type_":"type", + "mention_text":"mentionText", + "normalized_vertices":{ + "type":"2", + "unit":"normalized", + "base":"pageAnchor.pageRefs.0.boundingPoly.normalizedVertices", + "x":"left", + "y":"top", + "width":"width", + "height":"height" + } + } +} \ No newline at end of file diff --git a/tests/unit/resources/converters/test_config_type_3.json b/tests/unit/resources/converters/test_config_type_3.json new file mode 100644 index 00000000..8b45f0bd --- /dev/null +++ b/tests/unit/resources/converters/test_config_type_3.json @@ -0,0 +1,21 @@ +{ + "entity_object":"Entities", + "page": { + "height":"page_height", + "width":"page_width" + }, + "entity": { + "type_":"Entities:self", + "mention_text":"Text||normalizedText", + "normalized_vertices":{ + "type":"3", + "unit":"pxl", + "base":"bBox", + "x":"x", + "y":"y" + }, + "id":"id", + "confidence":"confidence", + "page_number":"page" + } +} \ No newline at end of file diff --git a/tests/unit/resources/converters/test_type_1.json b/tests/unit/resources/converters/test_type_1.json new file mode 100644 index 00000000..5fd4ef28 --- /dev/null +++ b/tests/unit/resources/converters/test_type_1.json @@ -0,0 +1,36 @@ +{ + "DocumentType": "ScannedPDF", + "NoOfPages": 1, + "pages": [ + {}, + { + "Entities": [ + { + "Type": "BusinessName", + "Text": "411 I.T. 
Group", + "id":0, + "bBox": [ + { + "x": 4.083333, + "y": 1.208333 + }, + { + "x": 5.8125, + "y": 1.208333 + }, + { + "x": 5.8125, + "y": 1.510416 + }, + { + "x": 4.083333, + "y": 1.510416 + } + ], + "page": "0", + "confidence": 0.9997831 + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/unit/resources/converters/test_type_2.json b/tests/unit/resources/converters/test_type_2.json new file mode 100644 index 00000000..5ed2411b --- /dev/null +++ b/tests/unit/resources/converters/test_type_2.json @@ -0,0 +1,30 @@ +{ + "document": { + "uri": "", + "mimeType": "application/pdf", + "page_height":1000, + "page_width":1000, + "entities": [ + { + "type": "invoice_id", + "mentionText": "4748", + "confidence": 0.980109, + "pageAnchor": { + "pageRefs": [ + { + "boundingPoly": { + "normalizedVertices": { + "width": 0.03, + "height":0.01, + "left": 0.07906712, + "top": 0.36043957 + } + } + } + ] + }, + "id": "0" + } + ] + } +} \ No newline at end of file diff --git a/tests/unit/resources/converters/test_type_3.json b/tests/unit/resources/converters/test_type_3.json new file mode 100644 index 00000000..ebba08cd --- /dev/null +++ b/tests/unit/resources/converters/test_type_3.json @@ -0,0 +1,25 @@ +{ + "DocumentType": "ScannedPDF", + "NoOfPages": 1, + "page_height":1000, + "page_width":1000, + "Entities": { + "BusinessName": { + "Text": "411 I.T. Group", + "normalizedText":"normalized 411 I.T. 
Group", + "id":0, + "bBox": [ + 392, + 116, + 558, + 116, + 558, + 145, + 392, + 145 + ], + "page": "0", + "confidence": 0.9997831 + } + } + } \ No newline at end of file diff --git a/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-0.json b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-0.json new file mode 100644 index 00000000..19424fc0 --- /dev/null +++ b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-0.json @@ -0,0 +1 @@ +{"pages":[{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1596},{"x":1596,"y":2505},{"y":2505}]},"confidence":0.98390293,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"942"}]}},"pageNumber":41},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1602},{"x":1602,"y":2496},{"y":2496}]},"confidence":0.98344266,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2211","startIndex":"942"}]}},"pageNumber":42},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1602},{"x":1602,"y":2496},{"y":2496}]},"confidence":0.79652208,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2573","startIndex":"2211"}]}},"pageNumber":43},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1622},{"x":1622,"y":2465},{"y":2465}]},"confidence":0.97713888,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3381","startIndex":"2573"}]}},"pageNumber":44},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1597},{"x":1597,"y":2503},{"y":2503}]},"confidence":0.87524492,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3599","startIndex":"3381"}]}},"pageNumber":45},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1616},{"x":1616,"y":2473},{"y":2
473}]},"confidence":0.98405439,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4424","startIndex":"3599"}]}},"pageNumber":46},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1605},{"x":1605,"y":2490},{"y":2490}]},"confidence":0.97508377,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5175","startIndex":"4424"}]}},"pageNumber":47},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1619},{"x":1619,"y":2469},{"y":2469}]},"confidence":0.98273796,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6181","startIndex":"5175"}]}},"pageNumber":48},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1605},{"x":1605,"y":2490},{"y":2490}]},"confidence":0.97522026,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"7366","startIndex":"6181"}]}},"pageNumber":49},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1609},{"x":1609,"y":2484},{"y":2484}]},"confidence":0.97771299,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"8532","startIndex":"7366"}]}},"pageNumber":50}],"shardInfo":{"shardCount":"5","shardIndex":"4","textOffset":"27701"},"text":"WINNIE-THE-POOH\n\"Pooh!\" cried Piglet. \"Do you think it is another\nWoozle?\"\n38\n\"No,\" said Pooh, \"because it makes different marks.\nIt is either Two Woozles and one, as it might be,\nWizzle, or Two, as it might be, Wizzles and one,\nif so it is, Woozle. Let us continue to follow them.'\nSo they went on, feeling just a little anxious now,\nin case the three animals in front of them were of\nHostile Intent. And Piglet wished very much that\nhis Grandfather T. W. 
were there, instead of else-\nwhere, and Pooh thought how nice it would be if\nthey met Christopher Robin suddenly but quite ac-\ncidentally, and only because he liked Christopher\nRobin so much. And then, all of a sudden, Winnie-\nthe-Pooh stopped again, and licked the tip of his\nnose in a cooling manner, for he was feeling more\nhot and anxious than ever in his life before. There\nwere four animals in front of them!\n\"Do you see, Piglet? Look at their tracks! Three,\nDigitized by\nGoogle\nPOOH AND PIGLET HUNT\n39\nas it were, Woozles, and one, as it was, Wizzle. An-\nother Woozle has joined them!”\nAnd so it seemed to be. There were the tracks;\ncrossing over each other here, getting muddled up\nwith each other there; but, quite plainly every now\nand then, the tracks of four sets of paws.\n\"I think,\" said Piglet, when he had licked the tip\nof his nose too, and found that it brought very little\ncomfort, \"I think that I have just remembered\nsomething. I have just remembered something that\nI forgot to do yesterday and shan't be able to do to-\nmorrow. So I suppose I really ought to go back and\ndo it now.'\n\"We'll do it this afternoon, and I'll come with\nyou,\" said Pooh.\n\"It isn't the sort of thing you can do in the after-\nnoon,” said Piglet quickly. “It's a very particular\nmorning thing, that has to be done in the morning,\nand, if possible, between the hours of What\nwould you say the time was?\"\n\"About twelve,\" said Winnie-the-Pooh, looking at\nthe sun.\n\"Between, as I was saying, the hours of twelve and\ntwelve five. So, really, dear old Pooh, if you'll ex-\ncuse me- What's that?\"\nPooh looked up at the sky, and then, as he heard\nthe whistle again, he looked up into the branches of\na big oak-tree, and then he saw a friend of his.\nDigitized by\nGoogle\n40\n\"It's Christopher Robin,\" he said.\nWINNIE-THE-POOH\nDigitized by\nMart\nAM\n\"Ah, then you'll be all right,\" said Piglet. \"You'll\nbe quite safe with him. 
Good-bye,\" and he trotted\noff home as quickly as he could, very glad to be\nOut of All Danger again.\nGoogle\n13\nWATER\nChristopher Robin came slowly down his tree.\n\"Silly old Bear,\" he said, \"what were you doing?\nPOOH AND PIGLET HUNT\n41\nFirst you went round the spinney twice by your-\nself, and then Piglet ran after you and you went\nround again together, and then you were just going\nround a fourth time--\"\n\"Wait a moment,\" said Winnie-the-Pooh, holding\nup his paw.\nHe sat down and thought, in the most thoughtful\nway he could think. Then he fitted his paw into\none of the Tracks . . . and then he scratched his\nnose twice, and stood up.\n\"Yes,\" said Winnie-the-Pooh.\n\"I see now,\" said Winnie-the-Pooh.\n\"I have been Foolish and Deluded,\" said he, \"and\nI am a Bear of No Brain at All.\"\n\"You're the Best Bear in All the World,” said\nChristopher Robin soothingly.\n\"Am I?\" said Pooh hopefully. And then he bright-\nened up suddenly.\n\"Anyhow,\" he said, \"it is nearly Luncheon Time.\"\nSo he went home for it.\nDigitized by Google\nIN WHICH Eeyore Loses a Tail\nand Pooh Finds One\nTHE Old Grey Donkey, Eeyore,\nstood by himself in a thistly corner of the forest,\nhis front feet well apart, his head on one side, and\nC\nCHAPTER IV\n42\nDigitized by\nGoogle\nEEYORE LOSES A TAIL\n43\nthought about things. Sometimes he thought sadly\nto himself, \"Why?\" and sometimes he thought,\n\"Wherefore?\" and sometimes he thought, \"Inas-\nmuch as which?\"-and sometimes he didn't quite\nknow what he was thinking about. So when Winnie-\nthe-Pooh came stumping along, Eeyore was very\nglad to be able to stop thinking for a little, in order\nto say \"How do you do?\" in a gloomy manner to\nhim.\n\"And how are you?\" said Winnie-the-Pooh.\nEeyore shook his head from side to side.\n\"Not very how,\" he said. 
\"I don't seem to have\nfelt at all how for a long time.\"\n\"Dear, dear,\" said Pooh, \"I'm sorry about that.\nLet's have a look at you.\"\nSo Eeyore stood there, gazing sadly at the ground,\nand Winnie-the-Pooh walked all round him once.\n\"Why, what's happened to your tail?\" he said in\nsurprise.\nDigitized by\nGoogle\n44\n\"What has happened to it?\" said Eeyore.\n\"It isn't there!\"\n\"Are you sure?\"\nWINNIE-THE-POOH\n\"Well, either a tail is there or it isn't there. You\ncan't make a mistake about it. And yours isn't\nthere!\"\n\"Then what is?\"\n\"Nothing.\"\n\"Let's have a look,\" said Eeyore, and he turned\nslowly round to the place where his tail had been a\nlittle while ago, and then, finding that he couldn't\ncatch it up, he turned round the other way, until he\ncame back to where he was at first, and then he put\nhis head down and looked between his front legs,\nand at last he said, with a long, sad sigh, \"I believe\nyou're right.\"\n\"Of course I'm right,\" said Pooh.\n\"That Accounts for a Good Deal,\" said Eeyore\ngloomily. \"It Explains Everything. No Wonder.\"\nDigitized by\nGoogle\nEEYORE LOSES A TAIL\n45\n\"You must have left it somewhere,\" said Winnie-\nthe-Pooh.\n\"Somebody must have taken it,\" said Eeyore. \"How\nLike Them,\" he added, after a long silence.\nPooh felt that he ought to say something helpful\nabout it, but didn't quite know what. So he decided\nto do something helpful instead.\n\"Eeyore,\" he said solemnly, \"I, Winnie-the-Pooh,\nwill find your tail for you.\"\nn\n\"Thank you, Pooh,\" answered Eeyore. \"You're a\nreal friend,\" said he. \"Not like Some,\" he said.\nSo Winnie-the-Pooh went off to find Eeyore's tail.\nIt was a fine spring morning in the forest as he\nstarted out. Little soft clouds played happily in a\nblue sky, skipping from time to time in front of the\nsun as if they had come to put it out, and then slid-\ning away suddenly so that the next might have his\nturn. 
Through them and between them the sun\nshone bravely; and a copse which had worn its firs\nall the year round seemed old and dowdy now be-\nside the new green lace which the beeches had put\nDigitized by\nGoogle\n46\non so prettily. Through copse and spinney marched\nBear; down open slopes of gorse and heather, over\nrocky beds of streams, up steep banks of sandstone\ninto the heather again; and so at last, tired and hun-\ngry, to the Hundred Acre Wood. For it was in the\nHundred Acre Wood that Owl lived.\n\"And if anyone knows anything about anything,\"\nsaid Bear to himself, \"it's Owl who knows some-\nthing about something,\" he said, “or my name's not\nWinnie-the-Pooh,” he said. “Which it is,” he added.\n\"So there\nyou are.\nOwl lived at The Chestnuts, an old-world resi-\ndence of great charm, which was grander than any-\nbody else's, or seemed so to Bear, because it had\nboth a knocker and a bell-pull. Underneath the\nknocker there was a notice which said:\nWINNIE-THE-POOH\nPLES RING IF AN RNSER IS REQIRD.\nUnderneath the bell-pull there was a notice which\nsaid:\nPLEZ CNOKE IF AN RNSR IS NOT REQID.\nThese notices had been written by Christopher\nRobin, who was the only one in the forest who\ncould spell; for Owl, wise though he was in many\nways, able to read and write and spell his own name\nWOL, yet somehow went all to pieces over delicate\nwords like MEASLES and BUTTERED TOAST.\nDigitized by\nGoogle\n48\nWINNIE-THE-POOH\nWinnie-the-Pooh read the two notices very care-\nfully, first from left to right, and afterwards, in case\nhe had missed some of it, from right to left. Then,\nto make quite sure, he knocked and pulled the\nknocker, and he pulled and knocked the bell-rope,\nand he called out in a very loud voice, “Owl! I re-\nquire an answer! It's Bear speaking.\" And the door\nopened, and Owl looked out.\n\"Hallo, Pooh,\" he said. \"How's things?\"\n\"Terrible and Sad,\" said Pooh, \"because Eeyore,\nwho is a friend of mine, has lost his tail. 
And he's\nMoping about it. So could you very kindly tell me\nhow to find it for him?\"\n\"Well,\" said Owl, \"the customary procedure in\nsuch cases is as follows.\"\n\"What does Crustimoney Proseedcake mean?” said\nPooh. \"For I am a Bear of Very Little Brain, and\nlong words, Bother me.\"\n\"It means the Thing to Do.\"\n\"As long as it means that, I don't mind,\" said Pooh\nhumbly.\n\"The thing to do is as follows. First, Issue a Re-\nward. Then--\"\n\"Just a moment,\" said Pooh, holding up his paw.\n“What do we do to this-what you were saying?\nYou sneezed just as you were going to tell me.\"\n\"I didn't sneeze.\"\n\"Yes, you did, Owl.\"\nDigitized by\nGoogle\n"} \ No newline at end of file diff --git a/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-1.json b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-1.json new file mode 100644 index 00000000..9b687d7e --- /dev/null +++ b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-1.json @@ -0,0 +1 @@ 
+{"pages":[{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1584},{"x":1584,"y":2524},{"y":2524}]},"confidence":0.97242725,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"965"}]}},"pageNumber":21},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1589},{"x":1589,"y":2516},{"y":2516}]},"confidence":0.62005234,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1535","startIndex":"965"}]}},"pageNumber":22},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1593},{"x":1593,"y":2510},{"y":2510}]},"confidence":0.980977,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2186","startIndex":"1535"}]}},"pageNumber":23},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1604},{"x":1604,"y":2492},{"y":2492}]},"confidence":0.85369617,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2379","startIndex":"2186"}]}},"pageNumber":24},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1588},{"x":1588,"y":2518},{"y":2518}]},"confidence":0.98032546,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3506","startIndex":"2379"}]}},"pageNumber":25},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1596},{"x":1596,"y":2505},{"y":2505}]},"confidence":0.94498634,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4167","startIndex":"3506"}]}},"pageNumber":26},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1586},{"x":1586,"y":2521},{"y":2521}]},"confidence":0.91387296,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5439","startIndex":"4167"}]}},"pageNumber":27},{"layout":{"boundingPoly":{"normalizedVertices":[{},{
"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1603},{"x":1603,"y":2494},{"y":2494}]},"confidence":0.97235525,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6175","startIndex":"5439"}]}},"pageNumber":28},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1598},{"x":1598,"y":2501},{"y":2501}]},"confidence":0.87959528,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6754","startIndex":"6175"}]}},"pageNumber":29},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1608},{"x":1608,"y":2486},{"y":2486}]},"confidence":0.97745353,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"8048","startIndex":"6754"}]}},"pageNumber":30}],"shardInfo":{"shardCount":"5","shardIndex":"2","textOffset":"13632"},"text":"18\nslowly out, and Winnie-the-Pooh floated down to\nthe ground.\nBut his arms were so stiff from holding on to the\nstring of the balloon all that time that they stayed\nup straight in the air for more than a week, and\nwhenever a fly came and settled on his nose he had\nto blow it off. And I think-but I am not sure-that\nthat is why he was always called Pooh.\nWINNIE-THE-POOH\n\"Is that the end of the story?\" asked Christopher\nRobin.\n\"That's the end of that one. There are others.\"\n\"About Pooh and Me?\"\n\"And Piglet and Rabbit and all of you. Don't you\nremember?\"\n\"I do remember, and then when I try to remem-\nber, I forget.\"\n\"That day when Pooh and Piglet tried to catch\nthe Heffalump-\n\"They didn't catch it, did they?\"\n\"No.\"\n\"Pooh couldn't, because he hasn't any brain. Did\n1 catch it?\"\n\"Well, that comes into the story.\"\nChristopher Robin nodded.\n\"I do remember,\" he said, \"only Pooh doesn't very\nwell, so that's why he likes having it told to him\nDigitized by\nGoogle\n01\nth\nre\nn\na\nha\nme\n01\n1\nch\nid\nFY\n11\nWE ARE INTRODUCED\n19\nagain. 
Because then it's a real story and not just a\nremembering.\"\n\"That's just how I feel,\" I said.\nChristopher Robin gave a deep sigh, picked his\nBear up by the leg, and walked off to the door,\ntrailing Pooh behind him. At the door he turned\nand said, \"Coming to see me have my bath?\"\n\"I might,\" I said.\nG\nTUFF\nLVW HIVS\n\"I didn't hurt him when I shot him, did I?\"\n\"Not a bit.\"\nHe nodded and went out, and in a moment\nI heard Winnie-the-Pooh-bump, bump, bump-\ngoing up the stairs behind him.\nDigitized by\nGoogle\nCHAPTER II\nIN WHICH Pooh Goes Visiting and\nGets Into a Tight Place\nEDWARD\nDWARD BEAR, known to his\nfriends as Winnie-the-Pooh, or Pooh for short, was\nwalking through the forest one day, humming\nproudly to himself. He had made up a little hum\nthat very morning, as he\nwas doing his Stoutness Ex-\nercises in front of the glass:\nTra-la-la, tra-la-la, as he.\nstretched up as high as he\ncould go, and then Tra-\nla-la, tra-la-oh, help!-la,\nas he tried to reach his toes.\nAfter breakfast he had said\nit over and over to himself until he had learnt it off\nby heart, and now he was humming it right through.\nproperly. It went like this:\n20\nDigitized by\nGoogle\n25\ne\nPOOH GOES VISITING\nTra-la-la, tra-la-la,\nTra-la-la, tra-la-la,\nRum-tum-tiddle-um-tum.\nTiddle-iddle, tiddle-iddle,\nTiddle-iddle, tiddle-iddle,\nRum-tum-tum-tiddle-um.\nDigitized by\nGoogle\n21\nWINNIE-THE-POOR\nWell, he was humming this hum to himself, and\nwalking along gaily, wondering what everybody\nelse was doing, and what it felt like, being some-\nbody else, when suddenly he came to a sandy bank,\nand in the bank was a large hole.\n\"Aha!\" said Pooh. (Rum-tum-tiddle-um-tum.) \"If\nI know anything about anything, that hole means\nRabbit,\" he said, \"and Rabbit means Company,\" he\nsaid, “and Company means Food and Listening-to\nMe-Humming and such like. 
Rum-tum-tum-\ntiddle-um.\"\n22\nSo he bent down, put his head into the hole, and\ncalled out:\n\"Is anybody at home?”\nThere was a sudden scuffling noise from inside\nthe hole, and then silence.\n\"What I said was, 'Is anybody at home?”” called\nout Pooh very loudly.\n\"No!\" said a voice; and then added, \"You needn't\nshout so loud. I heard you quite well the first time.\"\n\"Bother!\" said Pooh. \"Isn't there anybody here at\nall?\"\n\"Nobody.\"\nWinnie-the-Pooh took his head out of the hole,\nand thought for a little, and he thought to himself,\n\"There must be somebody there, because somebody\nmust have said 'Nobody.'\" So he put his head back\nin the hole, and said:\nDigitized by\nGoogle\nPOOH GOES VISITING\n\"Hallo, Rabbit, isn't that you?\"\n\"No,\" said Rabbit, in a different sort of voice this\ntime.\n\"But isn't that Rabbit's voice?\"\n\"I don't think so,\" said Rabbit. \"It isn't meant\nto be.\"\n\"Oh!\" said Pooh.\nHe took his head out of the hole, and had another\nthink, and then he put it back, and said:\n\"Well, could you very kindly tell me where Rab-\nbit is?\"\n\"He has gone to see his friend Pooh Bear, who is\na great friend of his.\"\n\"But this is Me!\" said Bear, very much surprised.\n\"What sort of Me?\"\n\"Pooh Bear.\"\n\"Are you sure?\" said Rabbit, still more surprised,\n\"Quite, quite sure,\" said Pooh.\n\"Oh, well, then, come in.\"\nCOBDE\n23\nDigitized by\nGoogle\n24\nSo Pooh pushed and pushed and pushed his way\nthrough the hole, and at last he got in.\n\"You were quite right,\" said Rabbit, looking at\nhim all over. \"It is you. Glad to see you.\"\nWINNIE-THE-POOH\n\"Who did you think it was?”\n\"Well, I wasn't sure. You know how it is in the\nForest. One can't have anybody coming into one's\nhouse. One has to be careful. 
What about a mouth-\nful of something?\"\nPooh always liked a little something at eleven\no'clock in the morning, and he was very glad to see\nRabbit getting out the plates and mugs; and when\nRabbit said, \"Honey or condensed milk with your\nbread?\" he was so excited that he said, \"Both,\" and\nthen, so as not to seem greedy, he added, “But don't\nbother about the bread, please.\" And for a long\ntime after that he said nothing until at last,\nhumming to himself in a rather sticky voice, he got\nup, shook Rabbit lovingly by the paw, and said that\nhe must be going on.\n● ●\n\"Must you?\" said Rabbit politely.\n\"Well,\" said Pooh, \"I could stay a little longer if\nit-if you-\" and he tried very hard to look in the\ndirection of the larder.\nDigitized by\n\"As a matter of fact,” said Rabbit, \"I was going\nout myself directly.\"\n\"Oh, well, then, I'll be going on. Good-bye.\"\n\"Well, good-bye, if you're sure you won't have\nany more.\"\nGoogle\nPOOH GOES VISITING\n\"Is there any more?\" asked Pooh quickly.\nRabbit took the covers off the dishes, and said,\n\"No, there wasn't.\"\n\"I thought not,\" said Pooh, nodding to himself.\n\"Well, good-bye. I must be going on.\"\n\"\"\nSo he started to climb out of the hole. He pulled\nwith his front paws, and pushed with his back paws,\nand in a little while his nose was out in the open\nagain... and then his ears . . . and then his front\npaws... and then his shoulders... and then--\n25\n\"Oh, help!\" said Pooh. \"I'd better go back.\"\n\"Oh, bother!\" said Pooh. \"I shall have to go on.\"\n\"I can't do either!\" said Pooh. \"Oh, help and\nbother!\"\nNow by this time Rabbit wanted to go for a\nwalk too, and finding the front door full, he went.\nDigitized by Google\n26\nWINNIE-THE-POOH\nout by the back door, and came round to Pooh, and\nlooked at him.\n\"Hallo, are you stuck?\" he asked.\ndo\n\"N-no,\" said Pooh carelessly. 
\"Just resting and\nthinking and humming to myself.\"\n\"Here, give us a paw.\"\nPooh Bear stretched out a paw, and Rabbit pulled\nand pulled and pulled. ...\n\"Ow!\" cried Pooh. \"You're hurting!\"\n\"The fact is,\" said Rabbit, \"you're stuck.\"\n\"It all comes,\" said Pooh crossly, \"of not having\nfront doors big enough.\"\n\"It all comes,\" said Rabbit sternly, \"of eating too\nmuch. I thought at the time,\" said Rabbit, \"only I\nDigitized by\nGoogle\n27\nPOOH GOES VISITING\ndidn't like to say anything,\" said Rabbit, \"that one\nof us was eating too much,\" said Rabbit, \"and I\nknew it wasn't me,\" he said. \"Well, well, I shall go\nand fetch Christopher Robin.\"\nChristopher Robin lived at the other end of the\nForest, and when he came back with Rabbit, and\nsaw the front half of Pooh, he said, \"Silly old Bear,\"\nin such a loving voice that everybody felt quite\nhopeful again.\n\"I was just beginning to think,\" said Bear, sniffing\nslightly, \"that Rabbit might never be able to use his\nfront door again. And I should hate that,\" he said.\n\"So should I,\" said Rabbit.\n\"Use his front door again?\" said Christopher\nRobin. 
\"Of course he'll use his front door again.\"\n\"Good,\" said Rabbit.\n\"If we can't pull you out, Pooh, we might push\nyou back.\"\nRabbit scratched his whiskers thoughtfully,, and\npointed out that, when once Pooh was pushed\nback, he was back, and of course nobody was more\nglad to see Pooh than he was, still there it was, some\nlived in trees and some lived underground, and-\n\"You mean I'd never get out?\" said Pooh.\n\"I mean,\" said Rabbit, “that having got so far, it\nseems a pity to waste it.\"\nChristopher Robin nodded.\n\"Then there's only one thing to be done,” he said.\n\"We shall have to wait for you to get thin again.\"\nDigitized by\nGoogle\n"} \ No newline at end of file diff --git a/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-2.json b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-2.json new file mode 100644 index 00000000..768ca33d --- /dev/null +++ b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-2.json @@ -0,0 +1 @@ 
+{"pages":[{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1586},{"x":1586,"y":2521},{"y":2521}]},"confidence":0.97234923,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"141"}]}},"pageNumber":1},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1587},{"x":1587,"y":2519},{"y":2519}]},"confidence":0.98454177,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1229","startIndex":"141"}]}},"pageNumber":2},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1588},{"x":1588,"y":2518},{"y":2518}]},"confidence":0.97692269,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2753","startIndex":"1229"}]}},"pageNumber":3},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1589},{"x":1589,"y":2515},{"y":2515}]},"confidence":0.95404208,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3544","startIndex":"2753"}]}},"pageNumber":4},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1591},{"x":1591,"y":2513},{"y":2513}]},"confidence":0.97564709,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4464","startIndex":"3544"}]}},"pageNumber":5},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1592},{"x":1592,"y":2511},{"y":2511}]},"confidence":0.92242199,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4956","startIndex":"4464"}]}},"pageNumber":6},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1581},{"x":1581,"y":2529},{"y":2529}]},"confidence":0.97620678,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5553","startIndex":"4956"}]}},"pageNumber":7},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1
},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1588},{"x":1588,"y":2517},{"y":2517}]},"confidence":0.89793885,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5788","startIndex":"5553"}]}},"pageNumber":8},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1587},{"x":1587,"y":2519},{"y":2519}]},"confidence":0.9834795,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6318","startIndex":"5788"}]}},"pageNumber":9},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1590},{"x":1590,"y":2514},{"y":2514}]},"confidence":0.98179817,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"7105","startIndex":"6318"}]}},"pageNumber":10}],"shardInfo":{"shardCount":"5"},"text":"WINNIE-THE-POOH\nBY A. A. MILNE\nwith decorations\nby Ernest H. Shepard\nPUBLISHED BY\nE. P. DUTTON \u0026 CO., INC., NEW YORK\n123\nDigitized by\nGoogle\nIntroduction\n(I₂\nF YOU happen to have read another\nbook about Christopher Robin, you may remember\nthat he once had a swan (or the swan had Christopher\nRobin, I don't know which) and that he used to call\nthis swan Pooh. That was a long time ago, and when\nwe said good-bye, we took the name with us, as we\ndidn't think the swan would want it any more. Well,\nwhen Edward Bear said that he would like an exciting\nname all to himself, Christopher Robin said at once,\nwithout stopping to think, that he was Winnie-the-\nPooh. And he was. So, as I have explained the Pooh\npart, I will now explain the rest of it.\nYou can't be in London for long without going to\nthe Zoo. There are some people who begin the Zoo\nat the beginning, called WAYIN, and walk as quickly\nas they can past every cage until they get to the one\ncalled WAYOUT, but the nicest people go straight\nto the animal they love the most, and stay there. 
So\nwhen Christopher Robin goes to the Zoo, he goes to\nwhere the Polar Bears are, and he whispers something\nto the third keeper from the left, and doors are un-\nDigitized by\nGoogle\nviii\nlocked, and we wander through dark passages and up\nsteep stairs, until at last we come to the special cage,\nand the cage is opened, and out trots something brown\nand furry, and with a happy cry of \"Oh, Bear!\" Chris-\ntopher Robin rushes into its arms. Now this bear's\nname is Winnie, which shows what a good name for\nbears it is, but the funny thing is that we can't remem-\nber whether Winnie is called after Pooh, or Pooh after\nWinnie. We did know once, but we have forgot-\nWINNIE-THE-POOH\nten.\nI had written as far as this when Piglet looked up\nand said in his squeaky voice, “What about Me?\"\n\"My dear Piglet,\" I said, \"the whole book is about\nyou.\" \"So it is about Pooh,\" he squeaked. You see\nwhat it is. He is jealous because he thinks Pooh is hav-\ning a Grand Introduction all to himself. Pooh is the\nfavourite, of course, there's no denying it, but Piglet\ncomes in for a good many things which Pooh misses;\nbecause you can't take Pooh to school without every-\nbody knowing it, but Piglet is so small that he slips\ninto a pocket, where it is very comfortable to feel him\nwhen you are not quite sure whether twice seven is\ntwelve or twenty-two. Sometimes he slips out and has\na good look in the ink-pot, and in this way he has got\nmore education than Pooh, but Pooh doesn't mind.\nSome have brains, and some haven't, he says, and there\nit is.\nAnd now all the others are saying, \"What about\nUs?\" So perhaps the best thing to do is to stop writ-\ning Introductions and get on with the book. A. A. M.\nDigitized by\nGoogle\nIN WHICH We Are Introduced to\nCHAPTER I\nWinnie-the-Pooh and Some\nBees, and the Stories Begin\nHERE is Edward Bear, coming\ndownstairs now, bump, bump, bump, on the back\nof his head, behind Christopher Robin. 
It is, as far\nas he knows, the only way of coming downstairs,\nbut sometimes he feels that there really is another\nway, if only he could stop bumping for a moment\nand think of it. And then he feels that perhaps there\nisn't. Anyhow, here he is at the bottom, and ready\nto be introduced to you. Winnie-the-Pooh.\nWhen I first heard his name, I said, just as you\nare going to say, \"But I thought he was a boy?\"\n\"So did I,\" said Christopher Robin.\n\"Then you can't call him Winnie?\"\n\"I don't.\"\n\"But you said--\"\n\"He's Winnie-ther-Pooh. Don't you know what\n'ther' means?”\nI\nDigitized by\nGoogle\nWINNIE-THE-POOH\n“Ah, yes, now I do,\" I said quickly; and I hope\nyou do too, because it is all the explanation you are\ngoing to get.\nSometimes Winnie-the-Pooh likes a game of some\nsort when he comes downstairs, and sometimes he\nlikes to sit quietly in front of the fire and listen to a\nstory. This evening-\n\"What about a story?\" said Christopher Robin.\n\"What about a story?\" I said.\n\"Could you very sweetly tell Winnie-the-Pooh\none?\"\n\"I suppose I could,\" I said. \"What sort of stories\ndoes he like?\"\n\"About himself. 
Because he's that sort of Bear.\"\n\"Oh, I see.\"\n\"So could you very sweetly?\"\n\"I'll try,\" I said.\nSo I tried,\nOnce upon a time, a very long time ago now,\nabout last Friday, Winnie-the-Pooh lived in a forest\nall by himself under the name of Sande.s.\n(\"What does 'under the name' mean?\" asked\nChristopher Robin.\n\"It means he had the name over the door in gold\nletters, and lived under it.”\nDigitized by\nGoogle\nWE ARE INTRODUCED\nAB SANDER\nRNIG\nAALSO\n\"Winnie-the-Pooh wasn't quite sure,\" said Chris-\ntopher Robin.\n\"Now I am,\" said a growly voice.\n\"Then I will go on,\" said I.)\n3\nOne day when he was out walking, he came to\nan open place in the middle of the forest, and in the\nmiddle of this place was a large oak-tree, and, from\nthe top of the tree, there came a loud buzzing-noise.\nWinnie-the-Pooh sat down at the foot of the tree,\nput his head between his paws and began to think.\nDigitized by\nGoogle\n4\nWINNIE-THE-POOH\nFirst of all he said to himself: \"That buzzing-\nnoise means something. You don't get a buzzing-\nnoise like that, just buzzing and buzzing, without\nits meaning something. If there's a buzzing-noise,\nsomebody's making a buzzing-noise, and the only\nreason for making a buzzing-noise that I know of is\nbecause you're a bee.\"\nThen he thought another long time, and said:\n\"And the only reason for being a bee that I know\nof is making honey.\"\nAnd then he got up, and said: “And the only\nreason for making honey is so as I can eat it.\" So he\nbegan to climb the tree.\nDigitized by\nGoogle\nWE ARE INTRODUCED\nK\nDigitized by\nHe\nclimbed\nand\nhe\n5.\nclimbed\nand\nhe\nclimbed,\nand\nas\nhe\nclimbed\nhe\nsang\na\nlittle\nsong\nto\nhimself.\nIt\nwent\nlike\nthis:\nIsn't it funny\nHow a bear likes honey?\nBuzz! Buzz! Buzz!\nI wonder why he does?\nGoogle\n6\nWINNIE-THE-POOH\nThen he climbed a little further . . . and a little\nfurther ... and then just a little further. 
By that\ntime he had thought of another song.\nIt's a very funny thought that, if Bears were Bees,\nThey'd build their nests at the bottom of trees.\nAnd that being so (if the Bees were Bears),\nWe shouldn't have to climb up all these stairs.\nHe was getting\nrather tired by this\ntime, so that is why\nhe sang a Complain-\ning Song. He was\nnearly there now,\nand if he just stood\non that branch...\nCrack!\nDigitized by\nGoogle\nWE ARE INTRODUCED\n7\n\"Oh, help!\" said Pooh, as he dropped ten feet on\nthe branch below him.\n\"If only I hadn't--\" he said, as he bounced\ntwenty feet on to the next branch.\n\"You see, what I meant to do,\" he explained, as he\nturned head-over-heels, and crashed on to another\nbranch thirty feet below, \"what I meant to do--\"\n\"Of course, it was rather--\" he admitted, as he\n'ithered very quickly through the next six branches.\n\"It all comes, I suppose,\" he decided, as he said\ngood-bye to the last branch, spun round three times,\nand flew gracefully into a gorse-bush, \"it all comes\nof liking honey so much. 
Oh, help!\"\nHe crawled out of the gorse-bush, brushed the\nprickles from his nose, and began to think again.\nAnd the first person he thought of was Christopher\nRobin.\nDigitized by\nGoogle\n"} \ No newline at end of file diff --git a/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-3.json b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-3.json new file mode 100644 index 00000000..bd33b2fb --- /dev/null +++ b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-3.json @@ -0,0 +1 @@ +{"pages":[{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1586},{"x":1586,"y":2521},{"y":2521}]},"confidence":0.98529071,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"245"}]}},"pageNumber":11},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1591},{"x":1591,"y":2513},{"y":2513}]},"confidence":0.82016128,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"499","startIndex":"245"}]}},"pageNumber":12},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1588},{"x":1588,"y":2518},{"y":2518}]},"confidence":0.98127097,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1174","startIndex":"499"}]}},"pageNumber":13},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1593},{"x":1593,"y":2510},{"y":2510}]},"confidence":0.98316687,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2297","startIndex":"1174"}]}},"pageNumber":14},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1592},{"x":1592,"y":2511},{"y":2511}]},"confidence":0.94795507,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3288","startIndex":"2297"}]}},"pageNumber":15},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"
vertices":[{},{"x":1604},{"x":1604,"y":2492},{"y":2492}]},"confidence":0.97641647,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3962","startIndex":"3288"}]}},"pageNumber":16},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1586},{"x":1586,"y":2520},{"y":2520}]},"confidence":0.97984999,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4847","startIndex":"3962"}]}},"pageNumber":17},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1593},{"x":1593,"y":2509},{"y":2509}]},"confidence":0.98226541,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5269","startIndex":"4847"}]}},"pageNumber":18},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1587},{"x":1587,"y":2519},{"y":2519}]},"confidence":0.97736621,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6175","startIndex":"5269"}]}},"pageNumber":19},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1590},{"x":1590,"y":2514},{"y":2514}]},"confidence":0.97455043,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6527","startIndex":"6175"}]}},"pageNumber":20}],"shardInfo":{"shardCount":"5","shardIndex":"1","textOffset":"7105"},"text":"WINNIE-THE-POOH\n(\"Was that me?\" said Christopher Robin in an\nawed voice, hardly daring to believe it.\n\"That was you.\"\n8\nChristopher Robin said nothing, but his eyes got\nlarger and larger, and his face got pinker and\npinker.)\nDigitized by\nGoogle\nWE ARE INTRODUCED\n9\nSo Winnie-the-Pooh went round to his friend\nChristopher Robin, who lived behind a green door\nin another part of the forest.\n\"Good morning, Christopher Robin,\" he said.\n\"Good morning, Winnie-ther-Pooh,\" said you.\nH\nDigitized by\nGoogle\n10\nWINNIE-THE-POOH\n\"I wonder if you've got such a thing as a balloon\nabout 
you?\"\n\"A balloon?\"\n\"Yes, I just said to myself coming along: 'I won-\nder if Christopher Robin has such a thing as a bal-\nloon about him?' I just said it to myself, thinking of\nballoons, and wondering.\"\n\"What do you want a balloon for?\" you said.\nWinnie-the-Pooh looked round to see that no-\nbody was listening, put his paw to his mouth, and\nsaid in a deep whisper: \"Honey!\"\n\"But you don't get honey with balloons!\"\n\"I do,\" said Pooh.\nWell, it just happened that you had been to a\nparty the day before at the house of your friend\nPiglet, and you had balloons at the party. You had\nDigitized by\nGoogle\nWE ARE INTRODUCED\nII\nhad a big green balloon; and one of Rabbit's rela-\ntions had had a big blue one, and had left it behind,\nbeing really too young to go to a party at all; and\nso you had brought the green one and the blue one\nhome with you.\n\"Which one would you like?\" you asked Pooh.\nHe put his head between his paws and thought\nvery carefully.\n\"It's like this,\" he said. \"When you go after honey\nwith a balloon, the great thing is not to let the bees\nknow you're coming. Now, if you have a green\nballoon, they might think you were only part of\nthe tree, and not notice you, and if you have a blue\nballoon, they might think you were only part of\nthe sky, and not notice you, and the question is:\nWhich is most likely?\"\n\"Wouldn't they notice you underneath the bal-\nloon?\"\nyou asked.\n\"They might or they might not,\" said Winnie-\nthe-Pooh. \"You never can tell with bees.\" He\nthought for a moment and said: \"I shall try to look\nlike a small black cloud. 
That will deceive them.\"\nThen you had better have the blue balloon,\" you\nsaid; and so it was decided.\nWell, you both went out with the blue balloon,\nDigitized by\nGoogle\n12\nWINNIE-THE-POOH\nand you took your gun with you, just in case, as\nyou always did, and Winnie-the-Pooh went to a\nvery muddy place that he knew of, and rolled and\nrolled until he was black all over; and then, when\nthe balloon was blown up as big as big, and\nyou and\nPooh were both holding on to the string, you let\ngo suddenly, and Pooh Bear floated gracefully up\ninto the sky, and stayed there-level with the top of\nthe tree and about twenty feet away from it.\n\"Hooray!\" you shouted.\n\"Isn't that fine?\" shouted Winnie-the-Pooh down\n\"What do I look like?\"\nto\nyou.\n\"You look like a Bear holding on to a balloon,\"\nYou said.\n\"Not,\" said Pooh anxiously, \"-not like a small\nblack cloud in a blue sky?\"\n\"Not very much.\"\n*Ah, well, perhaps from up here it looks different.\nAnd, as I say, you never can tell with bees.\"\nThere was no wind to blow him nearer to the\ntree, so there he stayed. He could see the honey, he\ncould smell the honey, but he couldn't quite reach\nthe honey.\nDigitized by\nGoogle\nWE ARE INTRODUCED\nAfter a little while he called down to you.\n\"Christopher Robin!\" he said in a loud whisper.\n\"Hallo!\"\n\"I think the bees suspect something!\"\n\"What sort of thing?\"\n\"I don't know. But something tells me that they're\nsuspicious!\"\n\"Perhaps they think that you're after their honey.\"\n\"It may be that. You never can tell with bees.\"\nThere was another little silence, and then he called\ndown to you again.\n\"Christopher Robin!\"\n\"Yes?\"\n“Have you an umbrella in your house?\"\n\"I think so.\"\n13\n\"I wish you would bring it out here, and walk up\nand down with it, and look up at me every now and\nthen, and say 'Tut-tut, it looks like rain.' 
I think, if\nDigitized by\nGoogle\n14\nyou did that, it would help the deception which we\nare practising on these bees.\"\nWINNIE-THE-POOH\nWell, you laughed to yourself, \"Silly old Bear!\"\nbut didn't say\nyou\nit aloud because you were so\nfond of him, and you went home for your umbrella.\n\"Oh, there you are!\" called down Winnie-the-\nPooh, as soon as you got back to the tree. \"I was\nbeginning to get anxious. I have discovered that the\nbees are now definitely Suspicious.\"\n\"Shall I put my umbrella up?\" you said.\n\"Yes, but wait a moment. We must be practical.\nThe important bee to deceive is the Queen Bee.\nCan you see which is the Queen Bee from down\nthere?\"\n\"No.\"\n\"A pity. Well, now, if you walk up and down\nwith your umbrella, saying, 'Tut-tut, it looks like\nrain,' I shall do what I can by singing a little Cloud\nSong, such as a cloud might sing.... Go!\"\nSo, while you walked up and down and wondered\nDigitized by\nGoogle\nWE ARE INTRODUCED\n15\nif it would rain, Winnie-the-Pooh sang this song:\nHow sweet to be a Cloud\nFloating in the Blue!\nEvery little cloud\nAlways sings aloud.\n\"How sweet to be a Cloud\nFloating in the Blue!\"\nIt makes him very proud\nTo be a little cloud.\nThe bees were still buzzing as suspiciously as\never. Some of them, indeed, left their nest and flew\nall round the cloud as it began the second verse of\nDigitized by\nGoogle\n16\nWINNIE-THE-POOH\nthis song, and one bee sat down on the nose of the\ncloud for a moment, and then got up again.\n\"Christopher-ow!-Robin,\"\ncalled out the cloud.\n\"Yes?\"\n\"I have just been thinking, and I have come to a\nvery important decision. These are the wrong sort\nof bees.\"\n\"Are they?\"\n\"Quite the wrong sort. So I should think they\nwould make the wrong sort of honey, shouldn't\nyou?\"\n\"Would they?\"\n\"Yes. So I think I shall come down.\"\n\"How?\" asked you.\nlet\nWinnie-the-Pooh hadn't thought about this. 
If he\ngo of the string, he would fall-bump-and he\ndidn't like the idea of that. So he thought for a long\ntime, and then he said:\n\"Christopher Robin, you must shoot the balloon\nwith your gun. Have you got your gun?\"\n\"Of course I have,\" you said. \"But if I do that, it\nwill spoil the balloon,” you said.\n\"But if you don't,\" said Pooh, \"I shall have to let\ngo, and that would spoil me.\"\nDigitized by\nGoogle\nWE ARE INTRODUCED\nWhen he put it like this, you saw how it was,\nand you aimed very carefully at the balloon, and\nfired.\n17\n\"Ow!\" said Pooh.\n\"Did I miss?\" you asked.\n\"You didn't exactly miss,\" said Pooh, \"but you\nmissed the balloon.”\n\"I'm so sorry,\" you said, and you fired again, and\nthis time you hit the balloon, and the air came\nDigitized by\nGoogle\n"} \ No newline at end of file diff --git a/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-4.json b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-4.json new file mode 100644 index 00000000..373e4c9a --- /dev/null +++ b/tests/unit/resources/unordered_shards/Winnie_the_Pooh_50_Pages-4.json @@ -0,0 +1 @@ 
+{"pages":[{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1602},{"x":1602,"y":2496},{"y":2496}]},"confidence":0.98292428,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1045"}]}},"pageNumber":31},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1591},{"x":1591,"y":2512},{"y":2512}]},"confidence":0.86585504,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1271","startIndex":"1045"}]}},"pageNumber":32},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1594},{"x":1594,"y":2509},{"y":2509}]},"confidence":0.85440701,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"1640","startIndex":"1271"}]}},"pageNumber":33},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1613},{"x":1613,"y":2479},{"y":2479}]},"confidence":0.78438234,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2049","startIndex":"1640"}]}},"pageNumber":34},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1601},{"x":1601,"y":2498},{"y":2498}]},"confidence":0.96820927,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"2917","startIndex":"2049"}]}},"pageNumber":35},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1601},{"x":1601,"y":2496},{"y":2496}]},"confidence":0.96317434,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3222","startIndex":"2917"}]}},"pageNumber":36},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1605},{"x":1605,"y":2491},{"y":2491}]},"confidence":0.82596785,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"3903","startIndex":"3222"}]}},"pageNumber":37},{"layout":{"boundingPoly":{"normalizedVertices":[
{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1607},{"x":1607,"y":2488},{"y":2488}]},"confidence":0.71312118,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"4071","startIndex":"3903"}]}},"pageNumber":38},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1598},{"x":1598,"y":2501},{"y":2501}]},"confidence":0.98653054,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"5343","startIndex":"4071"}]}},"pageNumber":39},{"layout":{"boundingPoly":{"normalizedVertices":[{},{"x":1},{"x":1,"y":1},{"y":1}],"vertices":[{},{"x":1606},{"x":1606,"y":2489},{"y":2489}]},"confidence":0.98035669,"orientation":"PAGE_UP","textAnchor":{"textSegments":[{"endIndex":"6021","startIndex":"5343"}]}},"pageNumber":40}],"shardInfo":{"shardCount":"5","shardIndex":"3","textOffset":"21680"},"text":"28\nWINNIE-THE-POOH\n\"How long does getting thin take?\" asked Pooh\nanxiously.\n\"About a week, I should think.\"\n\"But I can't stay here for a week!\"\n\"You can stay here all right, silly old Bear. It's\ngetting you out which is so difficult.\"\n\"We'll read to you,\" said Rabbit cheerfully. \"And\nI hope it won't snow,” he added. “And I say, old\nfellow, you're taking up a good deal of room in my\nhouse-do you mind if I use your back legs as a\ntowel-horse? Because, I mean, there they are-doing\nnothing-and it would be very convenient just to\nhang the towels on them.\"\n\"A week!\" said Pooh gloomily. \"What about\nmeals?\"\n\"I'm afraid no meals,\" said Christopher Robin,\n\"because of getting thin quicker. 
But we will read\nto you.\"\nBear began to sigh, and then found he couldn't\nbecause he was so tightly stuck; and a tear rolled\ndown his eye, as he said:\n\"Then would you read a Sustaining Book, such as\nwould help and comfort a Wedged Bear in Great\nTightness?\"\nSo for a week Christopher Robin read that sort\nof book at the North end of Pooh,\nDigitized by\nGoogle\nPOOH GOES VISITING\nand Rabbit\nرمه\nEle\nhung his washing on the South end... and in be-\ntween Bear felt himself getting slenderer and slen-\nderer. And at the end of the week Christopher\nRobin said, \"Now!\"\nDigitized by\n29\nGoogle\n30\nIDL A\n4/4\nJU\nWINNIE-THE-POOH\nSo he took hold of Pooh's\nfront paws and Rabbit took\nhold of Christopher Robin,\nand all Rabbit's friends and\nrelations took hold of Rabbit,\nand they all pulled together.\nAnd for a long time Pooh\nonly said \"Ow!\" ..\nAnd \"Oh!\" ...\nAnd then, all of a sudden,\nhe said \"Pop!\" just as if a\ncork were coming out of a\nbottle.\nDigitized by\nGoogle\nPOOH GOES VISITING\nAnd Christopher Robin and\nRabbit and all Rabbit's friends\nand relations went head-over-\nheels backwards... and on the\ntop of them came Winnie-the-\nPooh-free!\nSo, with a nod of thanks to\nhis friends, he went on with\nhis walk through the forest,\nhumming proudly to himself.\nBut, Christopher Robin looked\nafter him lovingly, and said to\nhimself, \"Silly old Bear!\"\nDigitized by\nGoogle\n31\nR\n*\na\nCHAPTER III\nIN WHICH Pooh and Piglet Go Hunting\nand Nearly Catch a Woozle\nT\n.HE PIGLET lived in a very grand\nhouse in the middle of a beech-tree, and the beech-\ntree was in the middle of the forest, and the Piglet\nlived in the middle of the house. Next to his house\nwas a piece of broken board which had: \"TRES-\nPASSERS W” on it. When Christopher Robin\nasked the Piglet what it meant, he said it was his\ngrandfather's name, and had been in the family for\na long time. 
Christopher Robin said you couldn't\nbe called Trespassers W, and Piglet said yes, you\ncould, because his grandfather was, and it was short\nfor Trespassers Will, which was short of Tres-\npassers William. And his grandfather had had two\nnames in case he lost one-Trespassers after an\nuncle, and William after Trespassers.\n\"I've got two names,\" said Christopher Robin\ncarelessly.\n32\nDigitized by\nGoogle\nPOOH AND PIGLET HUNT\nTRESPASSERS\nWH\n\"Well, there you are, that proves it,\" said Piglet.\nOne fine winter's day when Piglet was brushing\naway the snow in front of his house, he happened\nto look up, and there was Winnie-the-Pooh. Pooh\nwas walking round and round in a circle, thinking\nDigitized by\n33\nGoogle\n34\nWINNIE-THE-POOH\nof something else, and when Piglet called to him, he\njust went on walking.\n\"Hallo!\" said Piglet, \"what are you doing?\"\n\"Hunting,\" said Pooh.\n\"Hunting what?\"\n\"Tracking something,\" said Winnie-the-Pooh very\nmysteriously.\n\"Tracking what?\" said Piglet, coming closer.\n\"That's just what I ask myself. I ask myself,\nWhat?\"\nWhat do you think you'll answer?\"\n\"I shall have to wait until I catch up with it,\" said\nWinnie-the-Pooh. \"Now, look there.\" He pointed\n10/04.\nMalinska\nto the ground in front of him. \"What do you see\nthere?\"\n\"Tracks,\" said Piglet. \"Paw-marks.\" He gave a\nlittle squeak of excitment. \"Oh, Pooh! Do you\nthink it's a-a-a Woozle?\"\nDigitized by\nGoogle\nOB\nThe\nFY\n£\nPOOH AND PIGLET HUNT\n4.0\nDigitized by\n35\n\"It may be,\" said Pooh. \"Sometimes it is, and\nsometimes it isn't. You never can tell with paw-\nmarks.\"\nGoogle\ntale\n36\nWINNIE-THE-POOH\nWith these few words he went on tracking, and\nPiglet, after watching him for a minute or two, ran\nafter him. 
Winnie-the-Pooh had come to a sudden\nstop, and was bending over the tracks in a puzzled\nsort of way.\n\"What's the matter?\" asked Piglet.\n\"It's a very funny thing,\" said Bear, “but there\nseem to be two animals now. This-whatever-it-was\n-has been joined by another-whatever-it-is-and\nthe two of them are now proceeding in company.\nWould you mind coming with me, Piglet, in case\nthey turn out to be Hostile Animals?\"\nPiglet scratched his ear in a nice sort of way, and\nsaid that he had nothing to do until Friday, and\nwould be delighted to come, in case it really was a\nWoozle.\n\"You mean, in case it really is two Woozles,\" said\nWinnie-the-Pooh, and Piglet said that anyhow he\nhad nothing to do until Friday. So off they went\ntogether.\nThere was a small spinney of larch trees just\nhere, and it seemed as if the two Woozles, if that is\nwhat they were, had been going round this spin-\nney; so round this spinney went Pooh and Piglet\nafter them; Piglet passing the time by telling Pooh\nwhat his Grandfather Trespassers W had done to\nRemove Stiffness after Tracking, and how his\nGrandfather Trespassers W had suffered in his later\nDigitized by\nGoogle\nPOOH AND PIGLET HUNT\n37\nyears from Shortness of Breath, and other matters\nof interest, and Pooh wondering what a Grand-\nfather was like, and if perhaps this was Two Grand-\nfathers they were after now, and, if so, whether he\nwould be allowed to take one home and keep it,\nand what Christopher Robin would say. And still\nthe tracks went on in front of them....\nSuddenly Winnie-the-Pooh stopped, and pointed\nexcitedly in front of him. \"Look!\"\n\"What?\" said Piglet, with a jump. And then, to\nshow that he hadn't been frightened, he jumped up\nand down once or twice in an exercising sort of\nway.\n\"The tracks!\" said Pooh. 
“A third animal has\njoined the other two!”\nDigitized by\nGoogle\n"} \ No newline at end of file diff --git a/tests/unit/test_bbox_conversion.py b/tests/unit/test_bbox_conversion.py new file mode 100644 index 00000000..c701debd --- /dev/null +++ b/tests/unit/test_bbox_conversion.py @@ -0,0 +1,264 @@ +from google.cloud import documentai +from google.cloud.documentai_v1.types import geometry +from google.cloud.documentai_toolbox.converters.config import bbox_conversion, blocks + + +def test_midpoint_in_bpoly(): + vertex_a = geometry.NormalizedVertex(x=2, y=2) + box_a = geometry.BoundingPoly(normalized_vertices=[vertex_a]) + + vertex_b = geometry.NormalizedVertex(x=1, y=1) + vertex_b_max = geometry.NormalizedVertex(x=4, y=4) + box_b = geometry.BoundingPoly(normalized_vertices=[vertex_b, vertex_b_max]) + + actual = bbox_conversion._midpoint_in_bpoly(box_a=box_a, box_b=box_b) + assert actual + + +def test_merge_text_anchors(): + text_segment_1 = documentai.Document.TextAnchor.TextSegment( + start_index="0", end_index="100" + ) + text_anchor_1 = documentai.Document.TextAnchor(text_segments=[text_segment_1]) + + text_segment_2 = documentai.Document.TextAnchor.TextSegment( + start_index="100", end_index="200" + ) + text_anchor_2 = documentai.Document.TextAnchor(text_segments=[text_segment_2]) + + text_segment_3 = documentai.Document.TextAnchor.TextSegment( + start_index="0", end_index="200" + ) + expected = documentai.Document.TextAnchor(text_segments=[text_segment_3]) + actual = bbox_conversion._merge_text_anchors( + text_anchor_1=text_anchor_1, text_anchor_2=text_anchor_2 + ) + assert actual == expected + + +def test_get_text_anchor_in_bbox(): + vertex_a = geometry.NormalizedVertex(x=2, y=2) + vertex_a_max = geometry.NormalizedVertex(x=5, y=5) + box_a = geometry.BoundingPoly(normalized_vertices=[vertex_a, vertex_a_max]) + + vertex_b = geometry.NormalizedVertex(x=1, y=1) + vertex_b_max = geometry.NormalizedVertex(x=8, y=8) + box_b = 
geometry.BoundingPoly(normalized_vertices=[vertex_b, vertex_b_max]) + + text_segment_1 = documentai.Document.TextAnchor.TextSegment( + start_index="0", end_index="100" + ) + text_anchor_1 = documentai.Document.TextAnchor(text_segments=[text_segment_1]) + + text_segment_2 = documentai.Document.TextAnchor.TextSegment( + start_index="100", end_index="200" + ) + text_anchor_2 = documentai.Document.TextAnchor(text_segments=[text_segment_2]) + + layout1 = documentai.Document.Page.Layout( + bounding_poly=box_b, text_anchor=text_anchor_1 + ) + layout2 = documentai.Document.Page.Layout( + bounding_poly=box_b, text_anchor=text_anchor_2 + ) + + token1 = documentai.Document.Page.Token(layout=layout1) + token2 = documentai.Document.Page.Token(layout=layout2) + + page = documentai.Document.Page(tokens=[token1, token2]) + actual = bbox_conversion._get_text_anchor_in_bbox(bbox=box_a, page=page) + + text_segment_3 = documentai.Document.TextAnchor.TextSegment( + start_index="0", end_index="200" + ) + expected = documentai.Document.TextAnchor(text_segments=[text_segment_3]) + assert actual == expected + + +def test_get_norm_x_max(): + vertex_a_min = geometry.NormalizedVertex(x=2, y=2) + vertex_a_max = geometry.NormalizedVertex(x=4, y=4) + + bbox = geometry.BoundingPoly(normalized_vertices=[vertex_a_min, vertex_a_max]) + actual = bbox_conversion._get_norm_x_max(bbox=bbox) + assert actual == 4 + + +def test_get_norm_x_min(): + vertex_a_min = geometry.NormalizedVertex(x=2, y=2) + vertex_a_max = geometry.NormalizedVertex(x=4, y=4) + + bbox = geometry.BoundingPoly(normalized_vertices=[vertex_a_min, vertex_a_max]) + actual = bbox_conversion._get_norm_x_min(bbox=bbox) + assert actual == 2 + + +def test_get_norm_y_max(): + vertex_a_min = geometry.NormalizedVertex(x=2, y=2) + vertex_a_max = geometry.NormalizedVertex(x=4, y=4) + + bbox = geometry.BoundingPoly(normalized_vertices=[vertex_a_min, vertex_a_max]) + actual = bbox_conversion._get_norm_y_min(bbox=bbox) + assert actual == 2 + + +def 
test_get_norm_y_min(): + vertex_a_min = geometry.NormalizedVertex(x=2, y=2) + vertex_a_max = geometry.NormalizedVertex(x=4, y=4) + + bbox = geometry.BoundingPoly(normalized_vertices=[vertex_a_min, vertex_a_max]) + actual = bbox_conversion._get_norm_y_max(bbox=bbox) + assert actual == 4 + + +def test_normalize_coordinates(): + actual = bbox_conversion._normalize_coordinates(x=4.0, y=2.0) + assert actual == 2.0 + + +def test_convert_to_pixels(): + actual = bbox_conversion._convert_to_pixels(x=1, conversion_rate=96) + assert actual == 96 + + +def test_convert_bbox_units_with_normalized(): + actual = bbox_conversion._convert_bbox_units( + coordinate=0.5, input_bbox_units="normalized", width=2550, height=3300 + ) + assert actual == 0.5 + + +def test_convert_bbox_units_with_pxl(): + actual = bbox_conversion._convert_bbox_units( + coordinate=1, input_bbox_units="pxl", width=2550, height=3300 + ) + assert actual == 0.000392157 + + +def test_convert_bbox_units_with_inch(): + actual = bbox_conversion._convert_bbox_units( + coordinate=1, input_bbox_units="inch", width=2550, height=3300 + ) + assert actual == 0.037647059 + + +def test_convert_bbox_units_with_cm(): + actual = bbox_conversion._convert_bbox_units( + coordinate=1, input_bbox_units="cm", width=2550, height=3300 + ) + assert actual == 0.014821569 + + +def test_get_multiplier_pxl(): + actual = bbox_conversion._get_multiplier( + docproto_coordinate=1000, external_coordinate=1000, input_bbox_units="pxl" + ) + assert actual == 1.0 + + +def test_get_multiplier_inch(): + actual = bbox_conversion._get_multiplier( + docproto_coordinate=1000, external_coordinate=10.416, input_bbox_units="inch" + ) + assert actual == 1.000064004096262 + + +def test_get_multiplier_cm(): + actual = bbox_conversion._get_multiplier( + docproto_coordinate=1000, external_coordinate=26.4585, input_bbox_units="cm" + ) + assert actual == 1.000000992500985 + + +def test_convert_bbox_to_docproto_bbox_empty_coordinate(): + docproto = 
documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_1.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_1.json", "r") as (f): + config = f.read() + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + b[0].bounding_box = [] + + actual = bbox_conversion._convert_bbox_to_docproto_bbox(block=(b[0])) + + assert actual == [] + + +def test_convert_bbox_to_docproto_bbox_type_1(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_1.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_1.json", "r") as (f): + config = f.read() + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + actual = bbox_conversion._convert_bbox_to_docproto_bbox(block=(b[0])) + + assert actual.normalized_vertices != [] + assert actual.vertices == [] + assert "x" in str(actual.normalized_vertices) + assert "y" in str(actual.normalized_vertices) + + +def test_convert_bbox_to_docproto_bbox_type_2(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_2.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_2.json", "r") as (f): + config = f.read() + b = 
blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + actual = bbox_conversion._convert_bbox_to_docproto_bbox(block=(b[0])) + + assert actual.normalized_vertices != [] + assert actual.vertices == [] + assert "x" in str(actual.normalized_vertices) + assert "y" in str(actual.normalized_vertices) + + +def test_convert_bbox_to_docproto_bbox_type_3(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_3.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_3.json", "r") as (f): + config = f.read() + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + print(b[0].bounding_type) + + actual = bbox_conversion._convert_bbox_to_docproto_bbox(block=(b[0])) + + assert actual.normalized_vertices != [] + assert actual.vertices == [] + assert "x" in str(actual.normalized_vertices) + assert "y" in str(actual.normalized_vertices) diff --git a/tests/unit/test_blocks.py b/tests/unit/test_blocks.py new file mode 100644 index 00000000..9e1ac238 --- /dev/null +++ b/tests/unit/test_blocks.py @@ -0,0 +1,126 @@ +from google.cloud import documentai +from google.cloud.documentai_toolbox.converters.config import blocks + + +def test_create(): + actual = blocks.Block.create( + type_="test_type", + text="test_text", + bounding_box="", + block_references="", + block_id="", + confidence="", + page_number="", + page_width="", + page_height="", + bounding_width="", + bounding_height="", + bounding_type="", + bounding_unit="", + bounding_x="", + bounding_y="", + docproto_width="", + docproto_height="", + ) + + assert actual.type_ == "test_type" + assert actual.text == "test_text" + + +def test_get_target_object(): 
+ test_json_data = { + "document": {"entities": [{}, {"text": "test_text", "type": "test_type"}]} + } + + text = blocks._get_target_object( + json_data=test_json_data, target_object="document.entities.1.text" + ) + type = blocks._get_target_object( + json_data=test_json_data, target_object="document.entities.1.type" + ) + + assert text == "test_text" + assert type == "test_type" + + +def test_get_target_object_with_one_object(): + test_json_data = {"document": "document_test"} + + text = blocks._get_target_object(json_data=test_json_data, target_object="document") + + assert text == "document_test" + + +def test_get_target_object_without_target(): + test_json_data = { + "document": {"entities": [{}, {"text": "test_text", "type": "test_type"}]} + } + + text = blocks._get_target_object( + json_data=test_json_data, target_object="entities.text" + ) + + assert text is None + + +def test_load_blocks_from_scheme_type_1(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_1.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_1.json", "r") as (f): + config = f.read() + + actual = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + assert actual[0].text == "411 I.T. 
Group" + assert actual[0].type_ == "BusinessName" + + +def test_load_blocks_from_scheme_type_2(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_2.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_2.json", "r") as (f): + config = f.read() + + actual = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + assert actual[0].text == "4748" + assert actual[0].type_ == "invoice_id" + + +def test__load_blocks_from_schema_type_3(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_3.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_3.json", "r") as (f): + config = f.read() + + actual = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + assert actual[0].text == "normalized 411 I.T. Group" + assert actual[0].type_ == "BusinessName" diff --git a/tests/unit/test_converter.py b/tests/unit/test_converter.py new file mode 100644 index 00000000..0be99429 --- /dev/null +++ b/tests/unit/test_converter.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +try: + from unittest import mock +except ImportError: # pragma: NO COVER + import mock + +from google.cloud.documentai_toolbox.converters import converter + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_docproto_files", + return_value=(["file1"], ["test_label"], []), +) +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._upload", + return_value="Done", +) +def test__convert_documents_with_config( + mock_storage, mock_get_docproto_files, mock_upload, capfd +): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blob1 = mock.Mock(name="gs://test-directory/1/test-annotations.json") + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name="gs://test-directory/1/test-config.json") + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name="gs://test-directory/1/test.pdf") + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + client.list_blobs.return_value = [mock_blob1, mock_blob2, mock_blob3] + + converter.convert_from_config( + project_id="project-id", + location="location", + processor_id="project-id", + gcs_input_path="gs://test-directory/1", + gcs_output_path="gs://test-directory/1/output", + ) + + out, err = capfd.readouterr() + assert "test_label" in out diff --git 
a/tests/unit/test_converter_helpers.py b/tests/unit/test_converter_helpers.py new file mode 100644 index 00000000..845505c1 --- /dev/null +++ b/tests/unit/test_converter_helpers.py @@ -0,0 +1,490 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +try: + from unittest import mock +except ImportError: # pragma: NO COVER + import mock + +from google.cloud.documentai_toolbox.converters.config import blocks, converter_helpers +from google.cloud import documentai +import pytest + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers.documentai" +) +def test_get_base_ocr(mock_docai): + mock_client = mock_docai.DocumentProcessorServiceClient.return_value + + mock_client.process_document.return_value.document = "Done" + + actual = converter_helpers._get_base_ocr( + project_id="project_id", + location="location", + processor_id="processor_id", + file_bytes="file", + mime_type="application/pdf", + ) + + mock_client.process_document.assert_called() + assert actual == "Done" + + +def test_get_entity_content_type_3(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_3.json", "r") as (f): + invoice = f.read() + with 
open("tests/unit/resources/converters/test_config_type_3.json", "r") as (f): + config = f.read() + + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + actual = converter_helpers._get_entity_content(blocks=b, docproto=docproto) + + assert actual[0].type == "BusinessName" + assert actual[0].mention_text == "normalized 411 I.T. Group" + + +def test_get_entity_content_type_2(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_2.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_2.json", "r") as (f): + config = f.read() + + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + actual = converter_helpers._get_entity_content(blocks=b, docproto=docproto) + + assert actual[0].type == "invoice_id" + assert actual[0].mention_text == "4748" + + +def test_get_entity_content_type_1(): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + with open("tests/unit/resources/converters/test_type_1.json", "r") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_1.json", "r") as (f): + config = f.read() + + b = blocks._load_blocks_from_schema( + input_data=invoice, input_config=config, base_docproto=docproto + ) + + actual = converter_helpers._get_entity_content(blocks=b, docproto=docproto) + + assert actual[0].type == "BusinessName" + assert actual[0].mention_text == "411 I.T. 
Group" + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_base_ocr" +) +def test_convert_to_docproto_with_config(mock_ocr): + docproto = documentai.Document() + page = documentai.Document.Page() + dimensions = documentai.Document.Page.Dimension() + dimensions.width = 2550 + dimensions.height = 3300 + page.dimension = dimensions + docproto.pages = [page] + mock_ocr.return_value = docproto + + with open("tests/unit/resources/converters/test_type_3.json", "rb") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_3.json", "rb") as (f): + config = f.read() + with open("tests/unit/resources/toolbox_invoice_test.pdf", "rb") as (f): + pdf = f.read() + + actual = converter_helpers._convert_to_docproto_with_config( + name="test_document", + annotated_bytes=invoice, + config_bytes=config, + document_bytes=pdf, + project_id="project_id", + processor_id="processor_id", + location="location", + retry_number=0, + ) + + assert len(actual.pages) == 1 + assert len(actual.entities) == 1 + assert actual.entities[0].type == "BusinessName" + assert actual.entities[0].mention_text == "normalized 411 I.T. 
Group" + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_base_ocr" +) +def test_convert_to_docproto_with_config_with_error(mock_ocr, capfd): + mock_ocr.return_value = None + + with open("tests/unit/resources/converters/test_type_3.json", "rb") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_3.json", "rb") as (f): + config = f.read() + with open("tests/unit/resources/toolbox_invoice_test.pdf", "rb") as (f): + pdf = f.read() + + actual = converter_helpers._convert_to_docproto_with_config( + name="test_document", + annotated_bytes=invoice, + config_bytes=config, + document_bytes=pdf, + project_id="project_id", + processor_id="processor_id", + location="location", + retry_number=6, + ) + + out, err = capfd.readouterr() + + assert actual is None + assert "Could Not Convert test_document" in out + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_base_ocr" +) +def test_convert_to_docproto_with_config_with_error_and_retry(mock_ocr, capfd): + mock_ocr.return_value = None + + with open("tests/unit/resources/converters/test_type_3.json", "rb") as (f): + invoice = f.read() + with open("tests/unit/resources/converters/test_config_type_3.json", "rb") as (f): + config = f.read() + with open("tests/unit/resources/toolbox_invoice_test.pdf", "rb") as (f): + pdf = f.read() + + actual = converter_helpers._convert_to_docproto_with_config( + name="test_document", + annotated_bytes=invoice, + config_bytes=config, + document_bytes=pdf, + project_id="project_id", + processor_id="processor_id", + location="location", + retry_number=5, + ) + + out, err = capfd.readouterr() + + assert actual is None + assert "Could Not Convert test_document" in out + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_get_bytes(mock_storage): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = 
mock_bucket + + mock_ds_store = mock.Mock(name=[]) + mock_ds_store.name = "DS_Store" + + mock_blob1 = mock.Mock(name=[]) + mock_blob1.name = "gs://test-directory/1/test-annotations.json" + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name=[]) + mock_blob2.name = "gs://test-directory/1/test-config.json" + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name=[]) + mock_blob3.name = "gs://test-directory/1/test.pdf" + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + client.list_blobs.return_value = [mock_ds_store, mock_blob1, mock_blob2, mock_blob3] + + actual = converter_helpers._get_bytes( + bucket_name="bucket", + prefix="prefix", + annotation_file_prefix="annotations", + config_file_prefix="config", + ) + + assert actual == [ + "gs://test-directory/1/test-annotations.json", + "gs://test-directory/1/test.pdf", + "gs://test-directory/1/test-config.json", + "prefix", + "test", + ] + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_get_bytes_with_error(mock_storage): + with pytest.raises(Exception, match="Fail"): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blob1 = mock.Mock(name=[]) + mock_blob1.name = "gs://test-directory/1/test-annotations.json" + mock_blob1.download_as_bytes.side_effect = Exception("Fail") + + client.list_blobs.return_value = [mock_blob1] + + converter_helpers._get_bytes( + bucket_name="bucket", + prefix="prefix", + annotation_file_prefix="annotations", + config_file_prefix="config", + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_upload_file(mock_storage): + client = mock_storage.Client.return_value + + converter_helpers._upload_file( + bucket_name="bucket", output_prefix="prefix", file="file" + ) + 
client.bucket.return_value.blob.return_value.upload_from_string.assert_called_with( + "file", content_type="application/json" + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_bytes", + return_value="file_bytes", +) +def test_get_files(mock_storage, mock_get_bytes): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_ds_store = mock.Mock(name=[]) + mock_ds_store.name = "DS_Store" + + mock_blob1 = mock.Mock(name=[]) + mock_blob1.name = "gs://test-directory/1/test-annotations.json" + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name=[]) + mock_blob2.name = "gs://test-directory/1/test-config.json" + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name=[]) + mock_blob3.name = "gs://test-directory/1/test.pdf" + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + blob_list = [mock_ds_store, mock_blob1, mock_blob2, mock_blob3] + + actual = converter_helpers._get_files( + blob_list=blob_list, input_prefix="", input_bucket="test-directory" + ) + + assert actual[0].result() == "file_bytes" + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._convert_to_docproto_with_config", +) +def test_get_docproto_files(mocked_convert_docproto): + + mock_result = mock.Mock() + mock_result.result.return_value = [ + "annotated_bytes", + "document_bytes", + "config_bytes", + "document_1", + ] + + document = documentai.Document() + entities = [documentai.Document.Entity(type_="test_type", mention_text="test_text")] + document.entities = entities + + mocked_convert_docproto.return_value = document + ( + actual_files, + actual_unique_types, + actual_did_not_convert, + ) = 
converter_helpers._get_docproto_files( + f=[mock_result], + project_id="project-id", + processor_id="processor-id", + location="us", + ) + assert "test_type" in actual_files["document_1"] + assert "test_text" in actual_files["document_1"] + assert "test_type" in actual_unique_types + mocked_convert_docproto.assert_called_with( + annotated_bytes="annotated_bytes", + document_bytes="document_bytes", + config_bytes="config_bytes", + project_id="project-id", + location="us", + processor_id="processor-id", + retry_number=1, + name="document_1", + ) + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._convert_to_docproto_with_config", +) +def test_get_docproto_files_with_no_docproto(mocked_convert_docproto): + + mock_result = mock.Mock() + mock_result.result.return_value = [ + "annotated_bytes", + "document_bytes", + "config_bytes", + "document_1", + ] + + mocked_convert_docproto.return_value = None + ( + actual_files, + actual_unique_types, + actual_did_not_convert, + ) = converter_helpers._get_docproto_files( + f=[mock_result], + project_id="project-id", + processor_id="processor-id", + location="us", + ) + assert "document_1" in actual_did_not_convert + mocked_convert_docproto.assert_called_with( + annotated_bytes="annotated_bytes", + document_bytes="document_bytes", + config_bytes="config_bytes", + project_id="project-id", + location="us", + processor_id="processor-id", + retry_number=1, + name="document_1", + ) + + +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._upload_file", +) +def test_upload(mock_upload_file): + files = {} + files["document_1"] = "Document" + converter_helpers._upload(files, gcs_output_path="gs://output/") + + mock_upload_file.assert_called_with("output", "/document_1.json", "Document") + + +def test_upload_with_format_error(): + with pytest.raises(ValueError, match="gcs_prefix does not match accepted format"): + files = {} + files["document_1"] = "Document" + 
converter_helpers._upload(files, gcs_output_path="output/path") + + +def test_upload_with_file_error(): + with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): + files = {} + files["document_1"] = "Document" + converter_helpers._upload(files, gcs_output_path="gs://output/path.json") + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._get_docproto_files", + return_value=(["file1"], ["test_label"], ["document_2"]), +) +@mock.patch( + "google.cloud.documentai_toolbox.converters.config.converter_helpers._upload", + return_value="Done", +) +def test_convert_documents_with_config( + mock_storage, mock_get_docproto_files, mock_upload, capfd +): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blob1 = mock.Mock(name="gs://test-directory/1/test-annotations.json") + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name="gs://test-directory/1/test-config.json") + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name="gs://test-directory/1/test.pdf") + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + client.list_blobs.return_value = [mock_blob1, mock_blob2, mock_blob3] + + converter_helpers._convert_documents_with_config( + project_id="project-id", + location="location", + processor_id="project-id", + gcs_input_path="gs://test-directory/", + gcs_output_path="gs://test-directory-output/", + ) + + out, err = capfd.readouterr() + assert "test_label" in out + assert "Did not convert 1 documents" in out + assert "document_2" in out + + +def test_convert_documents_with_config_with_gcs_path_error(): + with pytest.raises(ValueError, match="gcs_prefix does not match accepted format"): + 
converter_helpers._convert_documents_with_config( + project_id="project-id", + location="location", + processor_id="project-id", + gcs_input_path="test-directory/1", + gcs_output_path="gs://test-directory/1/output", + ) + + +def test_convert_documents_with_config_with_file_error(): + with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): + converter_helpers._convert_documents_with_config( + project_id="project-id", + location="location", + processor_id="project-id", + gcs_input_path="gs://test-directory/1.json", + gcs_output_path="gs://test-directory/1/output", + ) diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 9e0bd86b..d2e89e77 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -28,8 +28,6 @@ from google.cloud.documentai_toolbox import document from google.cloud import documentai -from google.cloud import storage - from google.cloud.vision import AnnotateFileResponse @@ -56,6 +54,13 @@ def get_bytes_multiple_files_mock(): yield byte_factory +@pytest.fixture +def get_bytes_unordered_files_mock(): + with mock.patch.object(document, "_get_bytes") as byte_factory: + byte_factory.return_value = get_bytes("tests/unit/resources/unordered_shards") + yield byte_factory + + @pytest.fixture def get_bytes_form_parser_mock(): with mock.patch.object(document, "_get_bytes") as byte_factory: @@ -70,23 +75,6 @@ def get_bytes_splitter_mock(): yield byte_factory -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") -def test_get_bytes(mock_storage): - client = mock_storage.Client.return_value - mock_bucket = mock.Mock() - client.Bucket.return_value = mock_bucket - mock_blob1 = mock.Mock(name=[]) - mock_blob1.name.ends_with.return_value = True - mock_blob1.download_as_bytes.return_value = ( - "gs://test-directory/1/test-annotations.json" - ) - client.list_blobs.return_value = [mock_blob1] - - actual = document._get_bytes(gcs_bucket_name="test-directory", gcs_prefix="1") - - assert 
actual == ["gs://test-directory/1/test-annotations.json"] - - def test_get_shards_with_gcs_uri_contains_file_type(): with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): document._get_shards( @@ -113,6 +101,9 @@ def test_pages_from_shards(): actual = document._pages_from_shards(shards=shards) assert len(actual[0].paragraphs) == 31 + for page_index, page in enumerate(actual): + assert page.documentai_page.page_number == page_index + 1 + def test_entities_from_shard(): shards = [] @@ -163,148 +154,22 @@ def test_document_from_gcs_with_multiple_shards(get_bytes_multiple_files_mock): assert len(actual.pages) == 48 -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") -def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): - client = mock_storage.Client.return_value - - mock_bucket = mock.Mock() - - client.Bucket.return_value = mock_bucket - - blobs = [ - storage.Blob( - name="gs://test-directory/1/test_shard1.json", - bucket="gs://test-directory/1", - ), - storage.Blob( - name="gs://test-directory/1/test_shard2.json", - bucket="gs://test-directory/1", - ), - storage.Blob( - name="gs://test-directory/1/test_shard3.json", - bucket="gs://test-directory/1", - ), - ] - - client.list_blobs.return_value = blobs - - document.print_gcs_document_tree(gcs_bucket_name="test-directory", gcs_prefix="/") - - mock_storage.Client.assert_called_once() - - out, err = capfd.readouterr() - assert ( - out - == """gs://test-directory/1 -├──test_shard1.json -├──test_shard2.json -└──test_shard3.json\n\n""" - ) - - -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") -def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): - client = mock_storage.Client.return_value - - mock_bucket = mock.Mock() - - client.Bucket.return_value = mock_bucket - - blobs = [ - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - 
bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - ] - - client.list_blobs.return_value = blobs - - document.print_gcs_document_tree( - gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" - ) - - mock_storage.Client.assert_called_once() - - out, err = capfd.readouterr() - assert ( - out - == """gs://test-directory/documentai/output/123456789/1 -├──test_shard1.json -├──test_shard2.json -└──test_shard3.json\n\n""" - ) - - -@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") -def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): - client = mock_storage.Client.return_value - - mock_bucket = mock.Mock() - - client.Bucket.return_value = mock_bucket - - blobs = [ - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", - ), - storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - 
bucket="gs://test-directory/documentai/output/123456789/1", - ), - ] - client.list_blobs.return_value = blobs - - document.print_gcs_document_tree( - gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" - ) - - mock_storage.Client.assert_called_once() - - out, err = capfd.readouterr() - assert ( - out - == """gs://test-directory/documentai/output/123456789/1 -├──test_shard1.json -├──test_shard2.json -├──test_shard3.json -├──test_shard4.json -├──test_shard5.json -│ .... -└──test_shard6.json\n\n""" +def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock): + actual = document.Document.from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/2/" ) + get_bytes_unordered_files_mock.assert_called_once() + expected_shard_count = len(actual.shards) + current_text_offset = 0 + for expected_shard_index, shard in enumerate(actual.shards): + assert int(shard.shard_info.shard_index) == expected_shard_index + assert int(shard.shard_info.shard_count) == expected_shard_count + assert int(shard.shard_info.text_offset) == current_text_offset + current_text_offset += len(shard.text) -def test_print_gcs_document_tree_with_gcs_uri_contains_file_type(): - with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): - document.print_gcs_document_tree( - gcs_bucket_name="test-directory", - gcs_prefix="documentai/output/123456789/1/test_file.json", - ) + for page_index, page in enumerate(actual.pages): + assert page.documentai_page.page_number == page_index + 1 def test_search_page_with_target_string(get_bytes_single_file_mock): @@ -371,6 +236,42 @@ def test_get_entity_by_type(get_bytes_single_file_mock): assert actual[0].mention_text == "222 Main Street\nAnytown, USA" +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_get_bytes(mock_storage): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + 
mock_ds_store = mock.Mock(name=[]) + mock_ds_store.name = "DS_Store" + + mock_blob1 = mock.Mock(name=[]) + mock_blob1.name = "gs://test-directory/1/test-annotations.json" + mock_blob1.download_as_bytes.return_value = ( + "gs://test-directory/1/test-annotations.json" + ) + + mock_blob2 = mock.Mock(name=[]) + mock_blob2.name = "gs://test-directory/1/test-config.json" + mock_blob2.download_as_bytes.return_value = "gs://test-directory/1/test-config.json" + + mock_blob3 = mock.Mock(name=[]) + mock_blob3.name = "gs://test-directory/1/test.pdf" + mock_blob3.download_as_bytes.return_value = "gs://test-directory/1/test.pdf" + + client.list_blobs.return_value = [mock_ds_store, mock_blob1, mock_blob2, mock_blob3] + + actual = document._get_bytes( + gcs_bucket_name="bucket", + gcs_prefix="prefix", + ) + + assert actual == [ + "gs://test-directory/1/test-annotations.json", + "gs://test-directory/1/test-config.json", + ] + + def test_get_form_field_by_name(get_bytes_form_parser_mock): doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py new file mode 100644 index 00000000..f6d8747e --- /dev/null +++ b/tests/unit/test_utilities.py @@ -0,0 +1,413 @@ +# pylint: disable=protected-access +# -*- coding: utf-8 -*- +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from google.cloud import storage +from google.cloud.documentai_toolbox.utilities import utilities + +# try/except added for compatibility with python < 3.8 +try: + from unittest import mock +except ImportError: # pragma: NO COVER + import mock + + +test_bucket = "test-directory" +test_prefix = "documentai/input" + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_list_gcs_document_tree_with_one_folder(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/1/test_shard1.json", + bucket="gs://test-directory/1", + ), + storage.Blob( + name="gs://test-directory/1/test_shard2.json", + bucket="gs://test-directory/1", + ), + storage.Blob( + name="gs://test-directory/1/test_shard3.json", + bucket="gs://test-directory/1", + ), + ] + + client.list_blobs.return_value = blobs + + doc_list = utilities.list_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="/" + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + + assert "gs://test-directory/1" in list(doc_list.keys()) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + ] + + 
client.list_blobs.return_value = blobs + + doc_list = utilities.list_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + + assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + ] + client.list_blobs.return_value = blobs + + doc_list = utilities.list_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + + assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + + +def 
test_list_gcs_document_tree_with_gcs_uri_contains_file_type(): + with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): + utilities.list_gcs_document_tree( + gcs_bucket_name="test-directory", + gcs_prefix="documentai/output/123456789/1/test_file.json", + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/1/test_shard1.json", + bucket="gs://test-directory/1", + ), + storage.Blob( + name="gs://test-directory/1/test_shard2.json", + bucket="gs://test-directory/1", + ), + storage.Blob( + name="gs://test-directory/1/test_shard3.json", + bucket="gs://test-directory/1", + ), + ] + + client.list_blobs.return_value = blobs + + utilities.print_gcs_document_tree(gcs_bucket_name="test-directory", gcs_prefix="/") + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert ( + out + == """gs://test-directory/1 +├──test_shard1.json +├──test_shard2.json +└──test_shard3.json\n\n""" + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + ] + + 
client.list_blobs.return_value = blobs + + utilities.print_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert ( + out + == """gs://test-directory/documentai/output/123456789/1 +├──test_shard1.json +├──test_shard2.json +└──test_shard3.json\n\n""" + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + blobs = [ + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + storage.Blob( + name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", + bucket="gs://test-directory/documentai/output/123456789/1", + ), + ] + client.list_blobs.return_value = blobs + + utilities.print_gcs_document_tree( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/1/" + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert ( + out + == """gs://test-directory/documentai/output/123456789/1 +├──test_shard1.json 
+├──test_shard2.json +├──test_shard3.json +├──test_shard4.json +├──test_shard5.json +│ .... +└──test_shard6.json\n\n""" + ) + + +def test_print_gcs_document_tree_with_gcs_uri_contains_file_type(): + with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): + utilities.print_gcs_document_tree( + gcs_bucket_name="test-directory", + gcs_prefix="documentai/output/123456789/1/test_file.json", + ) + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_create_batches_with_3_documents(mock_storage, capfd): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blobs = [] + for i in range(3): + mock_blob = mock.Mock( + name=f"test_file{i}.pdf", content_type="application/pdf", size=1024 + ) + mock_blob.name.endswith.return_value = False + mock_blobs.append(mock_blob) + client.list_blobs.return_value = mock_blobs + + actual = utilities.create_batches( + gcs_bucket_name=test_bucket, gcs_prefix=test_prefix + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert out == "" + assert len(actual) == 1 + assert len(actual[0].gcs_documents.documents) == 3 + + +def test_create_batches_with_invalid_batch_size(capfd): + with pytest.raises(ValueError): + utilities.create_batches( + gcs_bucket_name=test_bucket, gcs_prefix=test_prefix, batch_size=51 + ) + + out, err = capfd.readouterr() + assert "Batch size must be less than" in out + assert err + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_create_batches_with_large_folder(mock_storage, capfd): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blobs = [] + for i in range(96): + mock_blob = mock.Mock( + name=f"test_file{i}.pdf", content_type="application/pdf", size=1024 + ) + mock_blob.name.endswith.return_value = False + mock_blobs.append(mock_blob) + 
client.list_blobs.return_value = mock_blobs + + actual = utilities.create_batches( + gcs_bucket_name=test_bucket, gcs_prefix=test_prefix + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert out == "" + assert len(actual) == 2 + assert len(actual[0].gcs_documents.documents) == 50 + assert len(actual[1].gcs_documents.documents) == 46 + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_create_batches_with_invalid_file_type(mock_storage, capfd): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blob = mock.Mock( + name="test_file.json", content_type="application/json", size=1024 + ) + mock_blob.name.endswith.return_value = False + client.list_blobs.return_value = [mock_blob] + + actual = utilities.create_batches( + gcs_bucket_name=test_bucket, gcs_prefix=test_prefix + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert "Invalid Mime Type" in out + assert actual == [] + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.storage") +def test_create_batches_with_large_file(mock_storage, capfd): + client = mock_storage.Client.return_value + mock_bucket = mock.Mock() + client.Bucket.return_value = mock_bucket + + mock_blob = mock.Mock( + name="test_file.pdf", content_type="application/pdf", size=2073741824 + ) + mock_blob.name.endswith.return_value = False + client.list_blobs.return_value = [mock_blob] + + actual = utilities.create_batches( + gcs_bucket_name=test_bucket, gcs_prefix=test_prefix + ) + + mock_storage.Client.assert_called_once() + + out, err = capfd.readouterr() + assert "File size must be less than" in out + assert actual == []