diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index d8a1bbca..51213ca0 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:5ea6d0ab82c956b50962f91d94e206d3921537ae5fe1549ec5326381d8905cfa -# created: 2024-01-15T16:32:08.142785673Z + digest: sha256:a0c4463fcfd9893fc172a3b3db2b6ac0c7b94ec6ad458c7dcea12d9693615ac3 +# created: 2024-02-17T12:21:23.177926195Z diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index bb3d6ca3..f80bdcd6 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -93,30 +93,39 @@ colorlog==6.7.0 \ # via # gcp-docuploader # nox -cryptography==41.0.6 \ - --hash=sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596 \ - --hash=sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c \ - --hash=sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660 \ - --hash=sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4 \ - --hash=sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead \ - --hash=sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed \ - --hash=sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3 \ - --hash=sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7 \ - --hash=sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09 \ - --hash=sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c \ - --hash=sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43 \ - --hash=sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65 \ - --hash=sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6 \ - --hash=sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da \ - --hash=sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c \ - --hash=sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b \ - --hash=sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8 \ - --hash=sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c \ - --hash=sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d \ - --hash=sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9 \ - --hash=sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86 \ - --hash=sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36 \ - --hash=sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae +cryptography==42.0.2 \ + --hash=sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380 \ + --hash=sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589 \ + --hash=sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea \ + --hash=sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65 \ + --hash=sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a \ + --hash=sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3 \ + --hash=sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008 \ + --hash=sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1 \ + --hash=sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2 \ + --hash=sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635 \ + --hash=sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2 \ + --hash=sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90 \ + --hash=sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee \ + --hash=sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a \ + --hash=sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242 \ + --hash=sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12 \ + --hash=sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2 \ + --hash=sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d \ + --hash=sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be \ + --hash=sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee \ + --hash=sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6 \ + --hash=sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529 \ + --hash=sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929 \ + --hash=sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1 \ + --hash=sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6 \ + --hash=sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a \ + --hash=sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446 \ + --hash=sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9 \ + --hash=sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888 \ + --hash=sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4 \ + --hash=sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33 \ + --hash=sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f # via # gcp-releasetool # secretstorage diff --git a/CHANGELOG.md b/CHANGELOG.md index 58f7ecaa..0337823d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.13.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.12.2-alpha...v0.13.0-alpha) (2024-02-26) + + +### Features + +* Added `gcs_uri` parameter to `Document.from_gcs()` to allow importing of a single Document JSON ([#261](https://github.com/googleapis/python-documentai-toolbox/issues/261)) ([f654a5d](https://github.com/googleapis/python-documentai-toolbox/commit/f654a5dc13247ae4c5cd4505440c3ce3a8bbf71a)) + ## [0.12.2-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.12.1-alpha...v0.12.2-alpha) (2024-02-02) diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 8583a779..0c5ff3e2 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -136,17 +136,13 @@ def get_blob( module (Optional[str]): Optional. The module for a custom user agent header. Returns: - List[storage.blob.Blob]: - A list of the blobs in the Cloud Storage path. + storage.blob.Blob: + The blob in the Cloud Storage path. """ - gcs_bucket_name, gcs_file_name = split_gcs_uri(gcs_uri) - - if not re.match(constants.FILE_CHECK_REGEX, gcs_file_name): + if not re.match(constants.FILE_CHECK_REGEX, gcs_uri): raise ValueError("gcs_uri must link to a single file.") - storage_client = _get_storage_client(module=module) - bucket = storage_client.bucket(bucket_name=gcs_bucket_name) - return bucket.get_blob(gcs_file_name) + return storage.Blob.from_string(gcs_uri, _get_storage_client(module=module)) def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]: diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index 1502adbf..8ed9c592 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.12.2-alpha" +__version__ = "0.13.0-alpha" diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 6df97312..6a49ed49 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -366,6 +366,7 @@ class Document: shards: List[documentai.Document] = dataclasses.field(repr=False) gcs_bucket_name: Optional[str] = dataclasses.field(default=None, repr=False) gcs_prefix: Optional[str] = dataclasses.field(default=None, repr=False) + gcs_uri: Optional[str] = dataclasses.field(default=None, repr=False) gcs_input_uri: Optional[str] = dataclasses.field(default=None, repr=False) _pages: Optional[List[Page]] = dataclasses.field( @@ -463,7 +464,7 @@ def from_gcs( gcs_prefix: str, gcs_input_uri: Optional[str] = None, ) -> "Document": - r"""Loads Document from Cloud Storage. + r"""Loads a Document from a Cloud Storage directory. Args: gcs_bucket_name (str): @@ -490,6 +491,40 @@ def from_gcs( gcs_input_uri=gcs_input_uri, ) + @classmethod + def from_gcs_uri( + cls: Type["Document"], + gcs_uri: str, + gcs_input_uri: Optional[str] = None, + ) -> "Document": + r"""Loads a Document from a Cloud Storage uri. + + Args: + gcs_uri (str): + Required. The full GCS uri to a Document JSON file. + + Example: `gs://{bucket_name}/{optional_folder}/{target_file}.json`. + gcs_input_uri (str): + Optional. The gcs uri to the original input file. + + Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/{file_name}.pdf` + Returns: + Document: + A document from gcs. + """ + blob = gcs_utilities.get_blob(gcs_uri=gcs_uri, module="get-document") + shards = [ + documentai.Document.from_json( + blob.download_as_bytes(), + ignore_unknown_fields=True, + ) + ] + return cls( + shards=shards, + gcs_uri=gcs_uri, + gcs_input_uri=gcs_input_uri, + ) + @classmethod def from_batch_process_metadata( cls: Type["Document"], metadata: documentai.BatchProcessMetadata diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index d44a3b06..a989fc83 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ pytest==7.4.4 mock==5.1.0 -google-cloud-bigquery==3.17.1 +google-cloud-bigquery==3.17.2 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 99f2daf4..4c0a7afd 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery==3.17.1 -google-cloud-documentai==2.21.1 +google-cloud-bigquery==3.17.2 +google-cloud-documentai==2.24.0 google-cloud-storage==2.14.0 -google-cloud-documentai-toolbox==0.11.1a0 +google-cloud-documentai-toolbox==0.12.2a0 diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 31ac799e..19c36ca4 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -105,6 +105,17 @@ def get_bytes_missing_shard_mock(): yield byte_factory +@pytest.fixture +def get_blob_mock(): + with mock.patch.object(gcs_utilities, "get_blob") as blob_factory: + mock_blob = mock.Mock() + mock_blob.download_as_bytes.return_value = get_bytes("tests/unit/resources/0")[ + 0 + ] + blob_factory.return_value = mock_blob + yield blob_factory + + def create_document_with_images_without_bbox(get_bytes_images_mock): doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" @@ -394,6 +405,25 @@ def test_document_from_gcs_with_unordered_shards(get_bytes_unordered_files_mock) assert page.page_number == page_index + 1 +def test_document_from_gcs_uri(get_blob_mock): + actual = document.Document.from_gcs_uri( + gcs_uri="gs://test-directory/documentai/output/123456789/0/document.json" + ) + + get_blob_mock.assert_called_once() + + assert ( + actual.gcs_uri + == "gs://test-directory/documentai/output/123456789/0/document.json" + ) + assert len(actual.pages) == 1 + # checking cached value + assert len(actual.pages) == 1 + + assert len(actual.text) > 0 + assert len(actual.text) > 0 + + def test_document_from_batch_process_metadata_with_multiple_input_files( get_bytes_multiple_directories_mock, ):