diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 077ca19a..8dd95549 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -253,7 +253,7 @@ def print_gcs_document_tree( ) for directory, files in path_list.items(): - print(directory) + print(create_gcs_uri(gcs_bucket_name, directory)) dir_size = len(files) for idx, file_name in enumerate(files): if idx == dir_size - 1: diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index bface9af..ea11f95e 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -485,7 +485,7 @@ def from_gcs( gcs_prefix: str, gcs_input_uri: Optional[str] = None, ) -> "Document": - r"""Loads a Document from a Cloud Storage directory. + r"""Loads a single Document from a Cloud Storage directory. Args: gcs_bucket_name (str): @@ -514,6 +514,35 @@ def from_gcs( gcs_input_uri=gcs_input_uri, ) + @classmethod + def from_gcs_multi( + cls: Type["Document"], + gcs_bucket_name: str, + gcs_prefix: str, + ) -> List["Document"]: + r"""Loads a list of Documents from a Cloud Storage directory. + + Args: + gcs_bucket_name (str): + Required. The gcs bucket. + + Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_bucket_name={bucket_name}`. + gcs_prefix (str): + Required. The prefix to the location of the target folder. + + Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}` where `gcs_prefix={optional_folder}/{target_folder}`. + Returns: + List[Document]: + A List of documents from gcs. + """ + return [ + Document.from_gcs(gcs_bucket_name=gcs_bucket_name, gcs_prefix=directory) + for directory, files in gcs_utilities.list_gcs_document_tree( + gcs_bucket_name, gcs_prefix + ).items() + if files != [""] + ] + @classmethod def from_gcs_uri( cls: Type["Document"], diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 286a482d..ed946d7a 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -27,11 +27,12 @@ import glob -from google.cloud.vision import AnnotateFileResponse import pytest from google.cloud import documentai +from google.cloud.storage import Blob from google.cloud.documentai_toolbox import document, gcs_utilities +from google.cloud.vision import AnnotateFileResponse def get_bytes(file_name): @@ -435,6 +436,37 @@ def test_document_from_gcs_uri(get_blob_mock): assert len(actual.text) > 0 +@mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") +def test_document_list_from_gcs_with_multiple_input_files( + mock_storage, + get_bytes_multiple_directories_mock, +): + client = mock_storage.Client.return_value + + mock_bucket = mock.Mock() + + client.Bucket.return_value = mock_bucket + + client.list_blobs.return_value = [ + Blob(name="documentai/output/123456789/1/test_shard1.json", bucket=None), + Blob(name="documentai/output/123456789/1/test_shard2.json", bucket=None), + Blob(name="documentai/output/123456789/2/test_shard3.json", bucket=None), + ] + documents = document.Document.from_gcs_multi( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/" + ) + get_bytes_multiple_directories_mock.assert_called() + assert get_bytes_multiple_directories_mock.call_count == 2 + + assert len(documents) == 2 + + assert documents[0].gcs_bucket_name == "test-directory" + assert documents[0].gcs_prefix == "documentai/output/123456789/1" + + assert documents[1].gcs_bucket_name == "test-directory" + assert documents[1].gcs_prefix == "documentai/output/123456789/2" + + def test_document_from_batch_process_metadata_with_multiple_input_files( get_bytes_multiple_directories_mock, ): diff --git a/tests/unit/test_gcs_utilities.py b/tests/unit/test_gcs_utilities.py index e0815aca..c3600117 100644 --- a/tests/unit/test_gcs_utilities.py +++ b/tests/unit/test_gcs_utilities.py @@ -92,16 +92,16 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): blobs = [ storage.Blob( - name="gs://test-directory/1/test_shard1.json", - bucket="gs://test-directory/1", + name="1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard2.json", - bucket="gs://test-directory/1", + name="1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard3.json", - bucket="gs://test-directory/1", + name="1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -113,11 +113,13 @@ def test_list_gcs_document_tree_with_one_folder(mock_storage): mock_storage.Client.assert_called_once() - assert "gs://test-directory/1" in list(doc_list.keys()) + assert doc_list == { + "1": ["test_shard1.json", "test_shard2.json", "test_shard3.json"] + } @mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") -def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): +def test_list_gcs_document_tree_with_3_documents(mock_storage): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -126,16 +128,16 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -147,13 +149,17 @@ def test_list_gcs_document_tree_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() - out, err = capfd.readouterr() - - assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + assert doc_list == { + "documentai/output/123456789/1": [ + "test_shard1.json", + "test_shard2.json", + "test_shard3.json", + ] + } @mock.patch("google.cloud.documentai_toolbox.utilities.gcs_utilities.storage") -def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): +def test_list_gcs_document_tree_with_more_than_5_document(mock_storage): client = mock_storage.Client.return_value mock_bucket = mock.Mock() @@ -162,28 +168,28 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -194,9 +200,16 @@ def test_list_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): mock_storage.Client.assert_called_once() - out, err = capfd.readouterr() - - assert "gs://test-directory/documentai/output/123456789/1" in list(doc_list.keys()) + assert doc_list == { + "documentai/output/123456789/1": [ + "test_shard1.json", + "test_shard2.json", + "test_shard3.json", + "test_shard4.json", + "test_shard5.json", + "test_shard6.json", + ] + } def test_list_gcs_document_tree_with_gcs_uri_contains_file_type(): @@ -217,16 +230,16 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/1/test_shard1.json", - bucket="gs://test-directory/1", + name="1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard2.json", - bucket="gs://test-directory/1", + name="1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/1/test_shard3.json", - bucket="gs://test-directory/1", + name="1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -239,6 +252,7 @@ def test_print_gcs_document_tree_with_one_folder(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/1 @@ -258,16 +272,16 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), ] @@ -280,6 +294,7 @@ def test_print_gcs_document_tree_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/1 @@ -299,28 +314,28 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -332,6 +347,7 @@ def test_print_gcs_document_tree_with_more_than_5_document(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/1 @@ -355,28 +371,28 @@ def test_print_gcs_document_tree_with_multiple_directories(mock_storage, capfd): blobs = [ storage.Blob( - name="gs://test-directory/documentai/output/123456789/0/test_shard1.json", - bucket="gs://test-directory/documentai/output/123456789/0", + name="documentai/output/123456789/0/test_shard1.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/0/test_shard2.json", - bucket="gs://test-directory/documentai/output/123456789/0", + name="documentai/output/123456789/0/test_shard2.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard3.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard3.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard4.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard4.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard5.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard5.json", + bucket=mock_bucket, ), storage.Blob( - name="gs://test-directory/documentai/output/123456789/1/test_shard6.json", - bucket="gs://test-directory/documentai/output/123456789/1", + name="documentai/output/123456789/1/test_shard6.json", + bucket=mock_bucket, ), ] client.list_blobs.return_value = blobs @@ -388,6 +404,7 @@ def test_print_gcs_document_tree_with_multiple_directories(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert ( out == """gs://test-directory/documentai/output/123456789/0 @@ -428,6 +445,7 @@ def test_create_batches_with_empty_directory(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 0 @@ -454,6 +472,7 @@ def test_create_batches_with_3_documents(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 1 assert len(actual[0].gcs_documents.documents) == 3 @@ -491,6 +510,7 @@ def test_create_batches_with_large_folder(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert out == "" assert len(actual) == 2 assert len(actual[0].gcs_documents.documents) == 50 @@ -516,6 +536,7 @@ def test_create_batches_with_invalid_file_type(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert "Invalid Mime Type" in out assert not actual @@ -539,6 +560,7 @@ def test_create_batches_with_large_file(mock_storage, capfd): mock_storage.Client.assert_called_once() out, err = capfd.readouterr() + assert not err assert "File size must be less than" in out assert not actual