diff --git a/CHANGELOG.md b/CHANGELOG.md index 69f4fb66..39d13a70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.10.3-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.10.2-alpha...v0.10.3-alpha) (2023-10-06) + + +### Bug Fixes + +* `docai_utilities.py` to return `Optional` ([#176](https://github.com/googleapis/python-documentai-toolbox/issues/176)) ([028bc37](https://github.com/googleapis/python-documentai-toolbox/commit/028bc37b8a488cecf7a3a71d90036594bfa0dc23)) + ## [0.10.2-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.10.1-alpha...v0.10.2-alpha) (2023-10-03) diff --git a/google/cloud/documentai_toolbox/utilities/docai_utilities.py b/google/cloud/documentai_toolbox/utilities/docai_utilities.py index 97fcf748..b86efa42 100644 --- a/google/cloud/documentai_toolbox/utilities/docai_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/docai_utilities.py @@ -15,7 +15,7 @@ # """Utilities for Document AI""" -from typing import Tuple +from typing import Optional, Tuple from google.cloud import documentai @@ -23,7 +23,7 @@ def get_bounding_box( bounding_poly: documentai.BoundingPoly, page_dimension: documentai.Document.Page.Dimension, -) -> Tuple[int, int, int, int]: +) -> Optional[Tuple[int, int, int, int]]: r"""Returns the bounding box of an element from the element bounding_poly and page dimensions. Args: @@ -35,10 +35,10 @@ def get_bounding_box( Returns: Tuple[int, int, int, int]: Bounding box coordinates in order (top, left, bottom, right). - Returns `0, 0, 0, 0` if `bounding_poly.normalized_vertices` is empty. + Returns `None` if `bounding_poly` or `bounding_poly.normalized_vertices` is empty. """ - if not bounding_poly.normalized_vertices: - return 0, 0, 0, 0 + if not bounding_poly or not bounding_poly.normalized_vertices: + return None vertices = [ ( diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index 88a3fd38..068ade97 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.10.2-alpha" +__version__ = "0.10.3-alpha" diff --git a/google/cloud/documentai_toolbox/wrappers/entity.py b/google/cloud/documentai_toolbox/wrappers/entity.py index 15ef150c..2ccb6530 100644 --- a/google/cloud/documentai_toolbox/wrappers/entity.py +++ b/google/cloud/documentai_toolbox/wrappers/entity.py @@ -95,9 +95,11 @@ def crop_image( if not documentai_page.image: raise ValueError("Document does not contain images.") - top, left, bottom, right = docai_utilities.get_bounding_box( + bbox = docai_utilities.get_bounding_box( bounding_poly=self.documentai_object.page_anchor.page_refs[0].bounding_poly, page_dimension=documentai_page.dimension, ) + if bbox is None: + return None doc_image = Image.open(BytesIO(documentai_page.image.content)) - return doc_image.crop((top, left, bottom, right)) + return doc_image.crop(bbox) diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index ca522894..7ba82af1 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -16,7 +16,7 @@ """Wrappers for Document AI Page type.""" import dataclasses -from typing import List, Optional, Union, cast +from typing import cast, List, Optional, Union import pandas as pd @@ -301,7 +301,7 @@ def _table_rows_from_documentai_table_rows( def _get_hocr_bounding_box( element_with_layout: ElementWithLayout, page_dimension: documentai.Document.Page.Dimension, -) -> str: +) -> Optional[str]: r"""Returns a hOCR bounding box string. Args: @@ -311,13 +311,21 @@ def _get_hocr_bounding_box( Required. Page dimension. Returns: - str: + Optional[str]: hOCR bounding box sring. """ - min_x, min_y, max_x, max_y = docai_utilities.get_bounding_box( + if not element_with_layout.layout.bounding_poly: + return None + + bbox = docai_utilities.get_bounding_box( bounding_poly=element_with_layout.layout.bounding_poly, page_dimension=page_dimension, ) + + if not bbox: + return None + + min_x, min_y, max_x, max_y = bbox return f"bbox {min_x} {min_y} {max_x} {max_y}" diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 07a18a85..331425b6 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ pytest==7.4.2 mock==5.1.0 -google-cloud-bigquery==3.11.4 +google-cloud-bigquery==3.12.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 6c55b6bf..047cfb35 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery==3.11.4 +google-cloud-bigquery==3.12.0 google-cloud-documentai==2.20.0 google-cloud-storage==2.11.0 -google-cloud-documentai-toolbox==0.9.0a0 +google-cloud-documentai-toolbox==0.10.2a0 diff --git a/setup.py b/setup.py index 0da7c1b7..7a29932e 100644 --- a/setup.py +++ b/setup.py @@ -53,18 +53,19 @@ "tabulate >= 0.9.0, <1.0.0", "proto-plus >= 1.22.0, <2.0.0dev", "proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'", - "grpc-google-iam-v1 >= 0.12.4, < 0.13dev", + "grpc-google-iam-v1 >= 0.12.6, < 0.13dev", "google-cloud-bigquery >= 3.5.0, < 4.0.0dev", "google-cloud-documentai >= 2.20.0, < 3.0.0dev", "google-cloud-storage >= 1.31.0, < 3.0.0dev", - "google-cloud-vision >= 2.7.0, < 4.0.0dev ", - "numpy >= 1.18.1", + "google-cloud-vision >= 2.7.0, < 4.0.0dev", + "numpy >= 1.18.1, < 2.0.0", "intervaltree >= 3.0.0", - "pikepdf >= 6.2.9, < 8.0.0", + "pikepdf >= 6.2.9, < 9.0.0", "pikepdf >= 6.2.9, < 7.0.0; python_version<'3.8'", - "immutabledict >= 2.0.0, < 3.0.0dev", - "Pillow >= 9.5.0, < 10.0.0", - "Jinja2 >= 3.1.0, <= 3.1.2", + "immutabledict >= 2.0.0, < 4.0.0", + "immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'", + "Pillow >= 9.5.0, < 11.0.0", + "Jinja2 >= 3.1.0, <= 4.0.0", ), python_requires=">=3.7", classifiers=[ @@ -75,6 +76,7 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Internet", "Topic :: Software Development :: Libraries :: Python Modules", ], diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 013a679d..3c64ab2e 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -8,9 +8,9 @@ google-api-core==1.34.0 libcst==0.2.5 pandas==1.0.0 proto-plus==1.22.0 -grpc-google-iam-v1==0.12.4 +grpc-google-iam-v1==0.12.6 google-cloud-bigquery==3.5.0 google-cloud-documentai==2.20.0 google-cloud-storage==2.7.0 -numpy==1.18.1 +numpy==1.19.5 pikepdf==6.2.9 diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index c9f0e4bb..ed1905e2 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -9,5 +9,5 @@ grpc-google-iam-v1 google-cloud-bigquery google-cloud-documentai google-cloud-storage -numpy -pikepdf +numpy==1.21.6 +pikepdf==8.2.3 diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index be9ae78c..d988e1e0 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -105,6 +105,19 @@ def get_bytes_missing_shard_mock(): yield byte_factory +def create_document_with_images_without_bbox(get_bytes_images_mock): + doc = document.Document.from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" + ) + + del ( + doc.entities[0] + .documentai_object.page_anchor.page_refs[0] + .bounding_poly.normalized_vertices + ) + return doc + + def test_get_shards_with_gcs_uri_contains_file_type(): with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"): document._get_shards( @@ -299,6 +312,13 @@ def test_document_from_document_path_with_single_shard(): assert len(actual.pages) == 1 +def test_document_from_document_path_with_directory(): + actual = document.Document.from_document_path( + document_path="tests/unit/resources/0/" + ) + assert len(actual.pages) == 1 + + def test_document_from_documentai_document_with_single_shard(): with open( "tests/unit/resources/0/toolbox_invoice_test-0.json", "r", encoding="utf-8" @@ -626,6 +646,7 @@ def test_export_images(get_bytes_images_mock): output_path = "resources/output/" if os.path.exists(output_path): shutil.rmtree(output_path) + assert not os.path.exists(output_path) doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" @@ -648,6 +669,20 @@ def test_export_images(get_bytes_images_mock): shutil.rmtree(output_path) +def test_export_images_empty_bounding_box(get_bytes_images_mock): + output_path = "resources/output/" + + doc = create_document_with_images_without_bbox(get_bytes_images_mock) + actual = doc.export_images( + output_path=output_path, + output_file_prefix="exported_photo", + output_file_extension="png", + ) + get_bytes_images_mock.assert_called_once() + + assert not actual + + def test_export_hocr_str(): wrapped_document = document.Document.from_document_path( document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" diff --git a/tests/unit/test_entity.py b/tests/unit/test_entity.py index df52905d..c382d4e8 100644 --- a/tests/unit/test_entity.py +++ b/tests/unit/test_entity.py @@ -102,3 +102,15 @@ def test_crop_image_without_page_image(docproto): match="Document does not contain images.", ): doc.entities[0].crop_image(documentai_page=docproto.pages[0]) + + +def test_crop_image_empty_bounding_box(docproto): + doc = document.Document.from_documentai_document(docproto) + del ( + doc.entities[0] + .documentai_object.page_anchor.page_refs[0] + .bounding_poly.normalized_vertices + ) + + actual = doc.entities[0].crop_image(documentai_page=docproto.pages[0]) + assert actual is None diff --git a/tests/unit/test_page.py b/tests/unit/test_page.py index 0f04c1a0..38025b45 100644 --- a/tests/unit/test_page.py +++ b/tests/unit/test_page.py @@ -40,6 +40,12 @@ def docproto_form_parser(): return documentai.Document.from_json(f.read()) +@pytest.fixture +def docproto_blank_document(): + with open("tests/unit/resources/blank_document.json", "r", encoding="utf-8") as f: + return documentai.Document.from_json(f.read()) + + def test_table_to_csv(docproto): docproto_page = docproto.pages[0] table = page.Table( @@ -160,6 +166,15 @@ def test_get_hocr_bounding_box(docproto): assert hocr_bounding_box_with_vertices == "bbox 1310 220 1534 282" +def test_get_hocr_bounding_box_with_blank_document(docproto_blank_document): + hocr_bounding_box_normalized = page._get_hocr_bounding_box( + element_with_layout=docproto_blank_document.pages[0], + page_dimension=docproto_blank_document.pages[0].dimension, + ) + + assert hocr_bounding_box_normalized is None + + # Class init Tests