diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 51213ca0..e4e943e0 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:a0c4463fcfd9893fc172a3b3db2b6ac0c7b94ec6ad458c7dcea12d9693615ac3 -# created: 2024-02-17T12:21:23.177926195Z + digest: sha256:98f3afd11308259de6e828e37376d18867fd321aba07826e29e4f8d9cab56bad +# created: 2024-02-27T15:56:18.442440378Z diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index f80bdcd6..bda8e38c 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -93,39 +93,39 @@ colorlog==6.7.0 \ # via # gcp-docuploader # nox -cryptography==42.0.2 \ - --hash=sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380 \ - --hash=sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589 \ - --hash=sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea \ - --hash=sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65 \ - --hash=sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a \ - --hash=sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3 \ - --hash=sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008 \ - --hash=sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1 \ - --hash=sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2 \ - --hash=sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635 \ - --hash=sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2 \ - --hash=sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90 \ - --hash=sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee \ - --hash=sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a \ - --hash=sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242 \ - --hash=sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12 \ - --hash=sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2 \ - --hash=sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d \ - --hash=sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be \ - --hash=sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee \ - --hash=sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6 \ - --hash=sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529 \ - --hash=sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929 \ - --hash=sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1 \ - --hash=sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6 \ - --hash=sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a \ - --hash=sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446 \ - --hash=sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9 \ - --hash=sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888 \ - --hash=sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4 \ - --hash=sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33 \ - --hash=sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f +cryptography==42.0.4 \ + --hash=sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b \ + --hash=sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce \ + --hash=sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88 \ + --hash=sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7 \ + --hash=sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20 \ + --hash=sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9 \ + --hash=sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff \ + --hash=sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1 \ + --hash=sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764 \ + --hash=sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b \ + --hash=sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298 \ + --hash=sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1 \ + --hash=sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824 \ + --hash=sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257 \ + --hash=sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a \ + --hash=sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129 \ + --hash=sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb \ + --hash=sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929 \ + --hash=sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854 \ + --hash=sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52 \ + --hash=sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923 \ + --hash=sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885 \ + --hash=sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0 \ + --hash=sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd \ + --hash=sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2 \ + --hash=sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18 \ + --hash=sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b \ + --hash=sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992 \ + --hash=sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74 \ + --hash=sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660 \ + --hash=sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925 \ + --hash=sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449 # via # gcp-releasetool # secretstorage diff --git a/CHANGELOG.md b/CHANGELOG.md index 0337823d..35504bb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.13.1-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.13.0-alpha...v0.13.1-alpha) (2024-03-04) + + +### Bug Fixes + +* Changed `client_info` import and added new quickstart samples ([#268](https://github.com/googleapis/python-documentai-toolbox/issues/268)) ([c4b1d58](https://github.com/googleapis/python-documentai-toolbox/commit/c4b1d58aaf4cedd2a08b9445220e44b906151e6a)), closes [#266](https://github.com/googleapis/python-documentai-toolbox/issues/266) + ## [0.13.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.12.2-alpha...v0.13.0-alpha) (2024-02-26) diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 0c5ff3e2..077ca19a 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -18,7 +18,7 @@ import re from typing import Dict, List, Optional, Tuple -from google.api_core import client_info +from google.api_core.gapic_v1 import client_info from google.cloud import documentai, documentai_toolbox, storage from google.cloud.documentai_toolbox import constants diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index 8ed9c592..05db1d43 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.13.0-alpha" +__version__ = "0.13.1-alpha" diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 6a49ed49..7818a2fa 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -22,6 +22,7 @@ import re from typing import Dict, List, Optional, Type, Union +from google.api_core.client_options import ClientOptions from google.api_core.operation import from_gapic as operation_from_gapic from google.cloud.vision import AnnotateFileResponse from google.longrunning.operations_pb2 import GetOperationRequest @@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume def _get_batch_process_metadata( operation_name: str, + location: Optional[str] = None, timeout: Optional[float] = None, ) -> documentai.BatchProcessMetadata: r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation. @@ -146,6 +148,10 @@ def _get_batch_process_metadata( operation_name (str): Required. The fully qualified operation name for a `batch_process_documents()` operation. + location (str): + Optional. The location of the processor used for `batch_process_documents()`. + Deprecated. Maintained for backwards compatibility. + timeout (float): Optional. Default None. Time in seconds to wait for operation to complete. If None, will wait indefinitely. @@ -153,15 +159,30 @@ def _get_batch_process_metadata( documentai.BatchProcessMetadata: Metadata from batch process. """ + # Validate Operation Name + match = re.search( + r"projects\/\w+\/locations\/(\w+)\/operations\/\w+", operation_name + ) + + if not match: + raise ValueError( + f"Invalid Operation Name: {operation_name}\n" + "Expected operation name in the format `projects//locations//operations/`" + ) + + location = location or match.group(1) + client = documentai.DocumentProcessorServiceClient( client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"), + client_options=ClientOptions( + api_endpoint=f"{location}-documentai.googleapis.com" + ), ) # Poll Operation until complete. operation = operation_from_gapic( operation=client.get_operation( request=GetOperationRequest(name=operation_name), - metadata=documentai.BatchProcessMetadata(), ), operations_client=client, result_type=documentai.BatchProcessResponse, @@ -599,6 +620,7 @@ def from_batch_process_operation( return cls.from_batch_process_metadata( metadata=_get_batch_process_metadata( operation_name=operation_name, + location=location, timeout=timeout, ) ) diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index f0d3a998..0cea14db 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -15,41 +15,88 @@ # [START documentai_toolbox_quickstart] +from typing import Optional +from google.cloud import documentai from google.cloud.documentai_toolbox import document from google.cloud.documentai_toolbox import gcs_utilities # TODO(developer): Uncomment these variables before running the sample. -# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder +# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder # gcs_bucket_name = "bucket" # gcs_prefix = "path/to/folder" +# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json +# gcs_uri = "gs://bucket/path/to/folder/document.json" + +# Or, given a Document JSON in path local/path/to/folder/document.json +# document_path = "local/path/to/folder/document.json" + +# Or, given a Document object from Document AI +# documentai_document = documentai.Document() + +# Or, given a BatchProcessMetadata object from Document AI +# operation = client.batch_process_documents(request) +# operation.result(timeout=timeout) +# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata) + +# Or, given a BatchProcessOperation name from Document AI +# batch_process_operation = "projects/project_id/locations/location/operations/operation_id" + + +def quickstart_sample( + gcs_bucket_name: Optional[str] = None, + gcs_prefix: Optional[str] = None, + gcs_uri: Optional[str] = None, + document_path: Optional[str] = None, + documentai_document: Optional[documentai.Document] = None, + batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None, + batch_process_operation: Optional[str] = None, +) -> None: + if gcs_bucket_name and gcs_prefix: + # Load from Google Cloud Storage Directory + print("Document structure in Cloud Storage") + gcs_utilities.print_gcs_document_tree( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + + wrapped_document = document.Document.from_gcs( + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix + ) + elif gcs_uri: + # Load a single Document from a Google Cloud Storage URI + wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri) + elif document_path: + # Load from local `Document` JSON file + wrapped_document = document.Document.from_document_path(document_path) + elif documentai_document: + # Load from `documentai.Document` object + wrapped_document = document.Document.from_documentai_document( + documentai_document + ) + elif batch_process_metadata: + # Load Documents from `BatchProcessMetadata` object + wrapped_documents = document.Document.from_batch_process_metadata( + metadata=batch_process_metadata + ) + wrapped_document = wrapped_documents[0] + elif batch_process_operation: + wrapped_documents = document.Document.from_batch_process_operation( + location="us", operation_name=batch_process_operation + ) + wrapped_document = wrapped_documents[0] + else: + raise ValueError("No document source provided.") -def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None: - print("Document structure in Cloud Storage") - gcs_utilities.print_gcs_document_tree( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix - ) - - wrapped_document = document.Document.from_gcs( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix - ) # For all properties and methods, refer to: # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document - # Alternatively, create wrapped document from: - # - # - Local `Document` JSON file: `document.Document.from_document_path()` - # - `Document` object: `document.Document.from_documentai_document()` - # - `BatchProcessMetadata`: `document.Document.from_batch_process_metadata()` - # - Batch Processing Operation: `document.Document.from_batch_process_operation()` - print("Document Successfully Loaded!") print(f"\t Number of Pages: {len(wrapped_document.pages)}") print(f"\t Number of Entities: {len(wrapped_document.entities)}") - for idx, page in enumerate(wrapped_document.pages): - print(f"Page {idx}") + for page in wrapped_document.pages: + print(f"Page {page.page_number}") for block in page.blocks: print(block.text) for paragraph in page.paragraphs: diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py index 912a27d8..cb7a9c4a 100644 --- a/samples/snippets/test_quickstart_sample.py +++ b/samples/snippets/test_quickstart_sample.py @@ -18,18 +18,105 @@ import pytest from samples.snippets import quickstart_sample +from google.cloud import documentai +from google.longrunning.operations_pb2 import ListOperationsRequest # type: ignore + location = "us" project_id = os.environ["GOOGLE_CLOUD_PROJECT"] -gcs_bucket_name = "documentai_toolbox_samples" -gcs_input_uri = "output/123456789/0" -def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None: +def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None: + gcs_bucket_name = "documentai_toolbox_samples" + gcs_prefix = "output/123456789/0" quickstart_sample.quickstart_sample( - gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri + gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix ) out, _ = capsys.readouterr() assert "Document structure in Cloud Storage" in out assert "Number of Pages: 1" in out assert "Number of Entities: 35" in out + + +def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None: + gcs_uri = ( + "gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json" + ) + quickstart_sample.quickstart_sample(gcs_uri=gcs_uri) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 35" in out + + +def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None: + document_path = "resources/form_with_tables.json" + quickstart_sample.quickstart_sample(document_path=document_path) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 0" in out + assert "Form Date" in out + + +def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None: + with open("resources/form_with_tables.json", encoding="utf-8") as f: + documentai_document = documentai.Document.from_json( + f.read(), ignore_unknown_fields=True + ) + + quickstart_sample.quickstart_sample(documentai_document=documentai_document) + out, _ = capsys.readouterr() + + assert "Number of Pages: 1" in out + assert "Number of Entities: 0" in out + assert "Form Date" in out + + +def test_quickstart_sample_batch_process_metadata( + capsys: pytest.CaptureFixture, +) -> None: + client = documentai.DocumentProcessorServiceClient() + name = f"{client.common_location_path(project=project_id, location=location)}/operations" + response = client.list_operations( + request=ListOperationsRequest( + name=name, + filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE", + page_size=1, + ) + ) + batch_process_metadata = documentai.BatchProcessMetadata.deserialize( + response.operations[0].metadata.value + ) + + quickstart_sample.quickstart_sample(batch_process_metadata=batch_process_metadata) + + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" in out + + +def test_quickstart_sample_batch_process_operation( + capsys: pytest.CaptureFixture, +) -> None: + client = documentai.DocumentProcessorServiceClient() + name = f"{client.common_location_path(project=project_id, location=location)}/operations" + response = client.list_operations( + request=ListOperationsRequest( + name=name, + filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE", + page_size=1, + ) + ) + batch_process_operation = response.operations[0].name + + quickstart_sample.quickstart_sample(batch_process_operation=batch_process_operation) + + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" in out + + +def test_quickstart_sample_no_input() -> None: + with pytest.raises(ValueError, match="No document source provided."): + quickstart_sample.quickstart_sample() diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 19c36ca4..e5ef5f1f 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai): document._get_batch_process_metadata(operation_name) +def test_get_batch_process_metadata_with_invalid_operation_name(): + with pytest.raises( + ValueError, + match="Invalid Operation Name", + ): + document._get_batch_process_metadata( + "projects//locations/us/operations/7890123" + ) + + def test_bigquery_column_name(): string_map = { "Phone #:": "phone_num",