From 793763b98781228a894d7a5072c29a9c5b2ef7e3 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Tue, 12 Dec 2023 10:40:10 -0600 Subject: [PATCH 1/5] feat: Added `Document.list_from_gcs()` method to get multiple wrapped documents from a GCS directory. Fixes #214 - Note: `from_gcs()` takes in a GCS directory, but it only works for a single sharded document from a single input document source. - In a GA release, it would be a better practice to have `from_gcs()` take in any GCS directory and output a list of Wrapped Documents. But this would be a backwards-incompatible change now. - Not sure if it's possible/advisable to have two possible return types for `from_gcs()` and just have it return a list when there are multiple Wrapped Documents? --- .../documentai_toolbox/wrappers/document.py | 29 +++++++++++++++++ tests/unit/test_document.py | 32 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 3de68230..0252ab23 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -481,6 +481,35 @@ def from_gcs( gcs_input_uri=gcs_input_uri, ) + @classmethod + def list_from_gcs( + cls: Type["Document"], + gcs_bucket_name: str, + gcs_prefix: str, + ) -> List["Document"]: + r"""Loads a list of Documents from Cloud Storage. + + Args: + gcs_bucket_name (str): + Required. The gcs bucket. + + Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/` where `gcs_bucket_name={bucket_name}`. + gcs_prefix (str): + Required. The prefix to the location of the target folder. + + Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_prefix={optional_folder}/{target_folder}`. + Returns: + List[Document]: + A List of documents from gcs. 
+ """ + return [ + Document.from_gcs(gcs_bucket_name=gcs_bucket_name, gcs_prefix=directory) + for directory, files in gcs_utilities.list_gcs_document_tree( + gcs_bucket_name, gcs_prefix + ).items() + if files != [""] + ] + @classmethod def from_batch_process_metadata( cls: Type["Document"], metadata: documentai.BatchProcessMetadata diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index c7c80a8d..da2ba8be 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -30,6 +30,7 @@ import pytest from google.cloud import documentai +from google.cloud.storage import Blob, Bucket from google.cloud.documentai_toolbox import document, gcs_utilities @@ -397,6 +398,37 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock) assert page.page_number == page_index + 1 +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") +def test_document_list_from_gcs_with_multiple_input_files( + mock_storage, + get_bytes_multiple_directories_mock, +): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + client.list_blobs.return_value = [ + Blob(name="documentai/output/123456789/1/test_shard1.json", bucket=None), + Blob(name="documentai/output/123456789/1/test_shard2.json", bucket=None), + Blob(name="documentai/output/123456789/2/test_shard3.json", bucket=None), + ] + documents = document.Document.list_from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/" + ) + get_bytes_multiple_directories_mock.assert_called() + assert get_bytes_multiple_directories_mock.call_count == 2 + + assert len(documents) == 2 + + assert documents[0].gcs_bucket_name == "test-directory" + assert documents[0].gcs_prefix == "documentai/output/123456789/1" + + assert documents[1].gcs_bucket_name == "test-directory" + assert documents[1].gcs_prefix == "documentai/output/123456789/2" + + def test_document_from_batch_process_metadata_with_multiple_input_files( get_bytes_multiple_directories_mock, ): From 0bb2213c74eb76c1002a30332ee3ca5489fa5e28 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Tue, 12 Dec 2023 10:58:07 -0600 Subject: [PATCH 2/5] fix: Fix inaccurate `gcs_utilities` tests. - Input data doesn't match what the actual `storage.list_blobs()` method outputs. - Updated output in `print_gcs_document_tree()` to match the test cases, as this was the intended structure. 
--- .../utilities/gcs_utilities.py | 2 +- tests/unit/test_document.py | 4 +- tests/unit/test_gcs_utilities.py | 160 ++++++++++-------- 3 files changed, 94 insertions(+), 72 deletions(-) diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 8583a779..681225dd 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -257,7 +257,7 @@ def print_gcs_document_tree( ) for directory, files in path_list.items(): - print(directory) + print(create_gcs_uri(gcs_bucket_name, directory)) dir_size = len(files) for idx, file_name in enumerate(files): if idx == dir_size - 1: diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index da2ba8be..e7acb8de 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -26,12 +26,12 @@ import glob -from google.cloud.vision import AnnotateFileResponse import pytest from google.cloud import documentai -from google.cloud.storage import Blob, Bucket +from google.cloud.storage import Blob from google.cloud.documentai_toolbox import document, gcs_utilities +from google.cloud.vision import AnnotateFileResponse def get_bytes(file_name): diff --git a/tests/unit/test_gcs_utilities.py b/tests/unit/test_gcs_utilities.py index e0815aca..c3600117 100644 --- a/tests/unit/test_gcs_utilities.py +++ b/tests/unit/test_gcs_utilities.py @@ -92,16 +92,16 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): blobs = [ storage.Blob( - name="gs://test-directory/1/test_shard1.json", - bucket="gs://test-directory/1", + name="1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard2.json", - bucket="gs://test-directory/1", + name="1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard3.json", - bucket="gs://test-directory/1", + name="1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -113,11 +113,13 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): mock_storage.Client.assert_called_once() - assert "gs://test-directory/1" in list(doc_list.keys()) + assert doc_list == { + "1": ["test_shard1.json", "test_shard2.json", "test_shard3.json"] + } @mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") -def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): +def test_list_gcs_document_tree_with_3_documents(mock_storage): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -126,16 +128,16 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -147,13 +149,17 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() - out, err = capfd.readouterr() - - 
assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + assert doc_list == { + "documentai/output/123456789/1": [ + "test_shard1.json", + "test_shard2.json", + "test_shard3.json", + ] + } @mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") -def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): +def test_list_gcs_document_tree_with_more_than_5_document(mock_storage): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -162,28 +168,28 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -194,9 +200,16 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): mock_storage.Client.assert_called_once() - out, err = capfd.readouterr() - - assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + assert doc_list == { + "documentai/output/123456789/1": [ + "test_shard1.json", + "test_shard2.json", + "test_shard3.json", + "test_shard4.json", + "test_shard5.json", + "test_shard6.json", + ] + } def test_list_gcs_document_tree_with_gcs_uri_contains_file_type(): @@ -217,16 +230,16 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/1/test_shard1.json", - bucket="gs://test-directory/1", + name="1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard2.json", - bucket="gs://test-directory/1", + name="1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard3.json", - bucket="gs://test-directory/1", + name="1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -239,6 +252,7 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/1 @@ -258,16 +272,16 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): blobs = [ storage.Blob( - 
name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -280,6 +294,7 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/1 @@ -299,28 +314,28 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -332,6 +347,7 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/1 @@ -355,28 +371,28 @@ def test_print_gcs_document_tree_with_multiple_directories(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/0/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/0", + name="documentai/output/123456789/0/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/0/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/0", + name="documentai/output/123456789/0/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", 
+ bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -388,6 +404,7 @@ def test_print_gcs_document_tree_with_multiple_directories(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/0 @@ -428,6 +445,7 @@ def test_create_batches_with_empty_directory(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 0 @@ -454,6 +472,7 @@ def test_create_batches_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 1 assert len(actual[0].gcs_documents.documents) == 3 @@ -491,6 +510,7 @@ def test_create_batches_with_large_folder(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 2 assert len(actual[0].gcs_documents.documents) == 50 @@ -516,6 +536,7 @@ def test_create_batches_with_invalid_file_type(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert "Invalid Mime Type" in out assert not actual @@ -539,6 +560,7 @@ def test_create_batches_with_large_file(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert "File size must be less than" in out assert not actual From 7cd3ac6fb941c74ab56ec2243d11d2f65de218a5 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Tue, 12 Dec 2023 13:13:39 -0600 Subject: [PATCH 3/5] Renamed function to `from_gcs_multi()` --- google/cloud/documentai_toolbox/wrappers/document.py | 4 ++-- tests/unit/test_document.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 0252ab23..2d949169 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -482,7 +482,7 @@ def from_gcs( ) @classmethod - def list_from_gcs( + def from_gcs_multi( cls: Type["Document"], gcs_bucket_name: str, gcs_prefix: str, @@ -493,7 +493,7 @@ def list_from_gcs( gcs_bucket_name (str): Required. The gcs bucket. - Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/` where `gcs_bucket_name={bucket_name}`. + Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_bucket_name={bucket_name}`. gcs_prefix (str): Required. The prefix to the location of the target folder. 
diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index e7acb8de..c0ef0396 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -414,7 +414,7 @@ def test_document_list_from_gcs_with_multiple_input_files( Blob(name="documentai/output/123456789/1/test_shard2.json", bucket=None), Blob(name="documentai/output/123456789/2/test_shard3.json", bucket=None), ] - documents = document.Document.list_from_gcs( + documents = document.Document.from_gcs_multi( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/" ) get_bytes_multiple_directories_mock.assert_called() From 65d6b299a5ab3f2f8d447a9b777e5941fe951f47 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 4 Mar 2024 12:28:40 -0600 Subject: [PATCH 4/5] Fix merge issue --- google/cloud/documentai_toolbox/wrappers/document.py | 1 + 1 file changed, 1 insertion(+) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index d7ac7c20..8e6109aa 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -541,6 +541,7 @@ def from_gcs_multi( if files != [""] ] + @classmethod def from_gcs_uri( cls: Type["Document"], gcs_uri: str, From ef302eb31466323e968584dc33d5baa3cf572e58 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 11 Mar 2024 12:12:20 -0500 Subject: [PATCH 5/5] Update docstrings to be more specific --- google/cloud/documentai_toolbox/wrappers/document.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index d7548b39..ea11f95e 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -485,7 +485,7 @@ def from_gcs( gcs_prefix: str, gcs_input_uri: Optional[str] = None, ) -> "Document": - r"""Loads a Document from a Cloud Storage directory. + r"""Loads a single Document from a Cloud Storage directory. Args: gcs_bucket_name (str): @@ -520,7 +520,7 @@ def from_gcs_multi( gcs_bucket_name: str, gcs_prefix: str, ) -> List["Document"]: - r"""Loads a list of Documents from Cloud Storage. + r"""Loads a list of Documents from a Cloud Storage directory. Args: gcs_bucket_name (str):
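A minimal usage sketch of the method added by this series, under its post-rename name `from_gcs_multi()` (bucket name and prefix below are placeholders):

    from google.cloud.documentai_toolbox import document

    # Each subdirectory under the prefix that contains shard JSON files is
    # wrapped as one Document, mirroring the batch-processing output layout.
    wrapped_documents = document.Document.from_gcs_multi(
        gcs_bucket_name="example-bucket",
        gcs_prefix="documentai/output/123456789/",
    )
    for wrapped_document in wrapped_documents:
        print(wrapped_document.gcs_prefix, len(wrapped_document.pages))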