From b4762e8212e9e435eaa430bcd345291c69e518ac Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Fri, 8 Mar 2024 12:32:45 -0600 Subject: [PATCH 1/2] fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. (#274) * fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. * Added Tests for GCS Matching Prefixes --- .../documentai_toolbox/wrappers/document.py | 2 + samples/snippets/quickstart_sample.py | 5 ++- samples/snippets/test_quickstart_sample.py | 26 +++++++++++++ tests/unit/test_document.py | 39 +++++++++++++++++-- 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 7818a2fa..bface9af 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -504,6 +504,8 @@ def from_gcs( Document: A document from gcs. """ + # Add trailing slash if not present. + gcs_prefix = gcs_prefix.rstrip("/") + "/" shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix) return cls( shards=shards, diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index 0cea14db..a387c438 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -52,7 +52,7 @@ def quickstart_sample( documentai_document: Optional[documentai.Document] = None, batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None, batch_process_operation: Optional[str] = None, -) -> None: +) -> document.Document: if gcs_bucket_name and gcs_prefix: # Load from Google Cloud Storage Directory print("Document structure in Cloud Storage") @@ -128,5 +128,6 @@ def quickstart_sample( if entity.normalized_text: print(f"\tNormalized Text: {entity.normalized_text}") + # [END documentai_toolbox_quickstart] -# [END documentai_toolbox_quickstart] + return wrapped_document diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py index cb7a9c4a..e1dd1370 100644 --- a/samples/snippets/test_quickstart_sample.py +++ b/samples/snippets/test_quickstart_sample.py @@ -96,6 +96,32 @@ def test_quickstart_sample_batch_process_metadata( assert "Document Successfully Loaded!" in out +def test_quickstart_sample_batch_process_metadata_matching_prefixes( + capsys: pytest.CaptureFixture, +) -> None: + batch_process_metadata = documentai.BatchProcessMetadata( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + documentai.BatchProcessMetadata.IndividualProcessStatus( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/1", + ), + documentai.BatchProcessMetadata.IndividualProcessStatus( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/11", + ), + ], + ) + wrapped_document = quickstart_sample.quickstart_sample( + batch_process_metadata=batch_process_metadata + ) + + assert wrapped_document.gcs_prefix == "output/matching-prefixes/1/" + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" in out + + def test_quickstart_sample_batch_process_operation( capsys: pytest.CaptureFixture, ) -> None: diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index e5ef5f1f..bcf71ae5 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -222,7 +222,7 @@ def test_get_batch_process_metadata_with_valid_operation( individual_process_statuses=[ documentai.BatchProcessMetadata.IndividualProcessStatus( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ) ], ) @@ -256,7 +256,7 @@ def test_get_batch_process_metadata_with_running_operation( individual_process_statuses=[ documentai.BatchProcessMetadata.IndividualProcessStatus( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ) ], ) @@ -442,11 +442,11 @@ def test_document_from_batch_process_metadata_with_multiple_input_files( individual_process_statuses=[ mock.Mock( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ), mock.Mock( input_gcs_source="gs://test-directory/documentai/input2.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/2/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/2", ), ], ) @@ -465,6 +465,37 @@ def test_document_from_batch_process_metadata_with_multiple_input_files( assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf" +def test_document_from_batch_process_metadata_with_multiple_input_files_matching_prefix( + get_bytes_multiple_directories_mock, +): + mock_metadata = mock.Mock( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", + ), + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input2.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/11", + ), + ], + ) + documents = document.Document.from_batch_process_metadata(mock_metadata) + + get_bytes_multiple_directories_mock.assert_called() + assert get_bytes_multiple_directories_mock.call_count == 2 + assert len(documents) == 2 + + assert documents[0].gcs_bucket_name == "test-directory" + assert documents[0].gcs_prefix == "documentai/output/123456789/1/" + assert documents[0].gcs_input_uri == "gs://test-directory/documentai/input.pdf" + + assert documents[1].gcs_bucket_name == "test-directory" + assert documents[1].gcs_prefix == "documentai/output/123456789/11/" + assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf" + + def test_document_from_batch_process_metadata_with_failed_operation(): with pytest.raises( ValueError, From b91ac08ae1235b37e96aca72ae560ef8fd4235b7 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 18:36:11 +0000 Subject: [PATCH 2/2] chore(main): release 0.13.2-alpha (#275) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 7 +++++++ google/cloud/documentai_toolbox/version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35504bb0..87b0072a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.13.2-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.13.1-alpha...v0.13.2-alpha) (2024-03-08) + + +### Bug Fixes + +* Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. ([#274](https://github.com/googleapis/python-documentai-toolbox/issues/274)) ([b4762e8](https://github.com/googleapis/python-documentai-toolbox/commit/b4762e8212e9e435eaa430bcd345291c69e518ac)) + ## [0.13.1-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.13.0-alpha...v0.13.1-alpha) (2024-03-04) diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index 05db1d43..61e745bb 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.13.1-alpha" +__version__ = "0.13.2-alpha"