diff --git a/CHANGELOG.md b/CHANGELOG.md
index 785fbd36..ffe533eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [0.6.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.5.0-alpha...v0.6.0-alpha) (2023-04-17)
+
+
+### Features
+
+* Add blocks to PageWrapper ([#107](https://github.com/googleapis/python-documentai-toolbox/issues/107)) ([df7dfe7](https://github.com/googleapis/python-documentai-toolbox/commit/df7dfe7b79d39010d5addb3fa861a9c803caae45))
+* Added `form_fields_to_bigquery()` method ([#104](https://github.com/googleapis/python-documentai-toolbox/issues/104)) ([96abe22](https://github.com/googleapis/python-documentai-toolbox/commit/96abe220c9909bcc5642ea146c06fd082a2f8009))
+
 ## [0.5.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.4.1-alpha...v0.5.0-alpha) (2023-04-07)
 
 
diff --git a/google/cloud/documentai_toolbox/converters/config/converter_helpers.py b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py
index 9a83ddb8..35ec5af1 100644
--- a/google/cloud/documentai_toolbox/converters/config/converter_helpers.py
+++ b/google/cloud/documentai_toolbox/converters/config/converter_helpers.py
@@ -214,6 +214,7 @@ def _get_bytes(
     annotation_file_prefix: str,
     config_file_prefix: str,
     config_path: str = None,
+    storage_client: storage.Client = None,
 ) -> List[bytes]:
     r"""Downloads documents and returns them as bytes.
 
@@ -233,8 +234,9 @@
         List[bytes].
 
     """
+    if not storage_client:
+        storage_client = gcs_utilities._get_storage_client(module="get-bytes")
 
-    storage_client = gcs_utilities._get_storage_client()
     bucket = storage_client.bucket(bucket_name=bucket_name)
     blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix)
 
@@ -273,6 +275,7 @@ def _upload_file(
     bucket_name: str,
     output_prefix: str,
     file: str,
+    storage_client: storage.Client = None,
 ) -> None:
     r"""Uploads the converted docproto to gcs.
 
@@ -288,7 +291,9 @@
         None.
 
     """
-    storage_client = gcs_utilities._get_storage_client()
+    if not storage_client:
+        storage_client = gcs_utilities._get_storage_client(module="upload-file")
+
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(output_prefix)
 
@@ -301,6 +306,7 @@ def _get_files(
     input_bucket: str,
     input_prefix: str,
     config_path: str = None,
+    storage_client: storage.Client = None,
 ):
     r"""Returns a list of Futures of documents as bytes.
 
@@ -340,6 +346,7 @@
                 "annotation",
                 "config",
                 config_path,
+                storage_client,
             )
             downloads.append(download)
 
@@ -399,7 +406,9 @@ def _get_docproto_files(
     return files, unique_types, did_not_convert
 
 
-def _upload(files: dict, gcs_output_path: str) -> None:
+def _upload(
+    files: dict, gcs_output_path: str, storage_client: storage.Client = None
+) -> None:
     r"""Upload converted document.proto to gcs location.
 
     Args:
@@ -440,6 +449,7 @@ def _upload(files: dict, gcs_output_path: str) -> None:
             output_bucket,
             f"{output_prefix}/{key}.json",
             files[key],
+            storage_client,
        )
         uploads.append(upload)
 
@@ -495,7 +505,7 @@ def _convert_documents_with_config(
     if file_check:
         raise ValueError("gcs_prefix cannot contain file types")
 
-    storage_client = gcs_utilities._get_storage_client()
+    storage_client = gcs_utilities._get_storage_client(module="config-converter")
 
     blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)
 
@@ -504,6 +514,7 @@
         input_prefix=input_prefix,
         input_bucket=input_bucket,
         config_path=config_path,
+        storage_client=storage_client,
     )
 
     f, _ = futures.wait(downloads)
@@ -525,7 +536,7 @@
         print(f"Did not convert {len(did_not_convert)} documents")
         print(did_not_convert)
 
-    _upload(files, gcs_output_path)
+    _upload(files, gcs_output_path, storage_client)
 
     print("-------- Finished Uploading --------")
     print("-------- Schema Information --------")
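Note: the change above lets `_convert_documents_with_config()` build one `storage.Client` and thread it through every download and upload instead of each helper constructing its own. A minimal sketch of the injection pattern, not the library's actual helper; the bucket and prefix names are hypothetical:

```python
from typing import List, Optional

from google.cloud import storage


def get_bytes_sketch(
    bucket_name: str, prefix: str, storage_client: Optional[storage.Client] = None
) -> List[bytes]:
    # Reuse the injected client; only build a fresh one as a fallback.
    if not storage_client:
        storage_client = storage.Client()
    return [
        blob.download_as_bytes()
        for blob in storage_client.list_blobs(bucket_name, prefix=prefix)
    ]


# One client (and one set of credentials) serves the whole conversion run.
client = storage.Client()
data = get_bytes_sketch("my-bucket", "input/", storage_client=client)
```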
diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py
index 306ae415..280d7703 100644
--- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py
+++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py
@@ -26,21 +26,32 @@
 from google.cloud.documentai_toolbox import constants
 
 
-def _get_storage_client():
-    r"""Returns a Storage client with custom user agent header.
+def _get_client_info(module: str = None) -> client_info.ClientInfo:
+    r"""Returns a custom user agent header.
 
     Returns:
-        storage.Client.
+        client_info.ClientInfo.
 
     """
-    user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}"
+    client_library_version = documentai_toolbox.__version__
+
+    if module:
+        client_library_version = f"{client_library_version}-{module}"
 
-    info = client_info.ClientInfo(
-        client_library_version=documentai_toolbox.__version__,
-        user_agent=user_agent,
+    return client_info.ClientInfo(
+        client_library_version=client_library_version,
+        user_agent=f"{constants.USER_AGENT_PRODUCT}/{client_library_version}",
     )
 
-    return storage.Client(client_info=info)
+
+def _get_storage_client(module: str = None) -> storage.Client:
+    r"""Returns a Storage client with custom user agent header.
+
+    Returns:
+        storage.Client.
+
+    """
+    return storage.Client(client_info=_get_client_info(module))
 
 
 def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
@@ -62,7 +73,7 @@ def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
     """
     result = []
 
-    storage_client = _get_storage_client()
+    storage_client = _get_storage_client(module="get-bytes")
     blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
 
     for blob in blob_list:
@@ -143,7 +154,7 @@ def list_gcs_document_tree(
     if file_check is not None:
         raise ValueError("gcs_prefix cannot contain file types")
 
-    storage_client = _get_storage_client()
+    storage_client = _get_storage_client(module="list-document")
     blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
 
     path_list: Dict[str, List[str]] = {}
@@ -227,7 +238,7 @@ def create_batches(
             f"Batch size must be less than {constants.BATCH_MAX_FILES}. You provided {batch_size}."
         )
 
-    storage_client = _get_storage_client()
+    storage_client = _get_storage_client(module="create-batches")
     blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
     batches: List[documentai.BatchDocumentsInputConfig] = []
     batch: List[documentai.GcsDocument] = []
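The `module` suffix exists so that each call site is distinguishable in the user-agent header. A self-contained sketch of the same logic, with hypothetical inlined stand-ins for `documentai_toolbox.__version__` and `constants.USER_AGENT_PRODUCT`:

```python
from google.api_core import client_info as client_info_lib

VERSION = "0.6.0-alpha"  # stand-in for documentai_toolbox.__version__
USER_AGENT_PRODUCT = "documentai-toolbox"  # stand-in for constants.USER_AGENT_PRODUCT


def get_client_info(module: str = None) -> client_info_lib.ClientInfo:
    # Appending the module name distinguishes call sites in usage metrics.
    version = f"{VERSION}-{module}" if module else VERSION
    return client_info_lib.ClientInfo(
        client_library_version=version,
        user_agent=f"{USER_AGENT_PRODUCT}/{version}",
    )


print(get_client_info("get-bytes").user_agent)
# documentai-toolbox/0.6.0-alpha-get-bytes
```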
diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py
index eba8486d..56d5b174 100644
--- a/google/cloud/documentai_toolbox/version.py
+++ b/google/cloud/documentai_toolbox/version.py
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-__version__ = "0.5.0-alpha"
+__version__ = "0.6.0-alpha"
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
index 89619c72..06ba7eee 100644
--- a/google/cloud/documentai_toolbox/wrappers/document.py
+++ b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -234,6 +234,106 @@ def _get_batch_process_metadata(
     return metadata
 
 
+def _insert_into_dictionary_with_list(dic: Dict, key: str, value: str) -> Dict:
+    r"""Inserts a value into a dictionary whose values can be lists.
+
+    Args:
+        dic (Dict):
+            Required. The dictionary to insert into.
+        key (str):
+            Required. The key to be created or inserted into.
+        value (str):
+            Required. The value to be inserted.
+
+    Returns:
+        Dict:
+            The dictionary after adding the key-value pair.
+    """
+    existing_value = dic.get(key)
+
+    if existing_value:
+        # For duplicate keys, convert the existing value
+        # to a list if it is not one already.
+        if not isinstance(existing_value, list):
+            existing_value = [existing_value]
+
+        existing_value.append(value)
+        dic[key] = existing_value
+    else:
+        dic[key] = value
+
+    return dic
+
+
+def _bigquery_column_name(input_string: str) -> str:
+    r"""Converts a string into a valid BigQuery column name.
+    https://cloud.google.com/bigquery/docs/schemas#column_names
+
+    Args:
+        input_string (str):
+            Required. The string to convert.
+    Returns:
+        str:
+            The converted string.
+
+    """
+    char_map: Dict[str, str] = {
+        r":|;|\(|\)|\[|\]|,|\.|\?|\!|\'|\n": "",
+        r"/| ": "_",
+        r"#": "num",
+        r"@": "at",
+    }
+
+    for key, value in char_map.items():
+        input_string = re.sub(key, value, input_string)
+
+    return input_string.lower()
+
+
+def _dict_to_bigquery(
+    dic: Dict,
+    dataset_name: str,
+    table_name: str,
+    project_id: Optional[str],
+) -> bigquery.job.LoadJob:
+    r"""Loads a dictionary into a BigQuery table.
+
+    Args:
+        dic (Dict):
+            Required. The dictionary to insert.
+        dataset_name (str):
+            Required. Name of the BigQuery dataset.
+        table_name (str):
+            Required. Name of the BigQuery table.
+        project_id (Optional[str]):
+            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.
+    Returns:
+        bigquery.job.LoadJob:
+            The BigQuery LoadJob for adding the dictionary.
+
+    """
+    bq_client = bigquery.Client(
+        project=project_id, client_info=gcs_utilities._get_client_info()
+    )
+    table_ref = bigquery.DatasetReference(
+        project=project_id, dataset_id=dataset_name
+    ).table(table_name)
+
+    job_config = bigquery.LoadJobConfig(
+        schema_update_options=[
+            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
+            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
+        ],
+        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+    )
+
+    return bq_client.load_table_from_json(
+        json_rows=[dic],
+        destination=table_ref,
+        job_config=job_config,
+    )
+
+
 @dataclasses.dataclass
 class Document:
     r"""Represents a wrapped `Document`.
@@ -476,6 +576,49 @@ def get_form_field_by_name(self, target_field: str) -> List[FormField]:
 
         return found_fields
 
+    def form_fields_to_dict(self) -> Dict:
+        r"""Returns a dictionary of the form fields in the document.
+
+        Returns:
+            Dict:
+                The dict of form fields, keyed by field name.
+
+        """
+        form_fields_dict: Dict = {}
+        for p in self.pages:
+            for form_field in p.form_fields:
+                field_name = _bigquery_column_name(form_field.field_name)
+                form_fields_dict = _insert_into_dictionary_with_list(
+                    form_fields_dict, field_name, form_field.field_value
+                )
+
+        return form_fields_dict
+
+    def form_fields_to_bigquery(
+        self, dataset_name: str, table_name: str, project_id: Optional[str] = None
+    ) -> bigquery.job.LoadJob:
+        r"""Adds extracted form fields to a BigQuery table.
+
+        Args:
+            dataset_name (str):
+                Required. Name of the BigQuery dataset.
+            table_name (str):
+                Required. Name of the BigQuery table.
+            project_id (Optional[str]):
+                Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.
+        Returns:
+            bigquery.job.LoadJob:
+                The BigQuery LoadJob for adding the form fields.
+
+        """
+
+        return _dict_to_bigquery(
+            self.form_fields_to_dict(),
+            dataset_name,
+            table_name,
+            project_id,
+        )
+
     def get_entity_by_type(self, target_type: str) -> List[Entity]:
         r"""Returns the list of Entities of target_type.
 
@@ -500,20 +643,10 @@ def entities_to_dict(self) -> Dict:
         """
         entities_dict: Dict = {}
         for entity in self.entities:
-            entity_type = entity.type_.replace("/", "_")
-
-            existing_entity = entities_dict.get(entity_type)
-            if not existing_entity:
-                entities_dict[entity_type] = entity.mention_text
-                continue
-
-            # For entities that can have multiple (e.g. line_item)
-            # Change Entity Type to a List
-            if not isinstance(existing_entity, list):
-                existing_entity = [existing_entity]
-
-            existing_entity.append(entity.mention_text)
-            entities_dict[entity_type] = existing_entity
+            entity_type = _bigquery_column_name(entity.type_)
+            entities_dict = _insert_into_dictionary_with_list(
+                entities_dict, entity_type, entity.mention_text
+            )
 
         return entities_dict
 
@@ -534,23 +667,12 @@
                 The BigQuery LoadJob for adding the entities.
 
         """
-        bq_client = bigquery.Client(project=project_id)
-        table_ref = bigquery.DatasetReference(
-            project=project_id, dataset_id=dataset_name
-        ).table(table_name)
-
-        job_config = bigquery.LoadJobConfig(
-            schema_update_options=[
-                bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
-                bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
-            ],
-            source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
-        )
 
-        return bq_client.load_table_from_json(
-            json_rows=[self.entities_to_dict()],
-            destination=table_ref,
-            job_config=job_config,
+        return _dict_to_bigquery(
+            self.entities_to_dict(),
+            dataset_name,
+            table_name,
+            project_id,
         )
 
     def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
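Taken together, the new helpers give form fields the same dict/BigQuery path that entities already had. A sketch of the intended call flow; the bucket, prefix, dataset, table, and project names below are hypothetical:

```python
from google.cloud.documentai_toolbox import document

wrapped_document = document.Document.from_gcs(
    gcs_bucket_name="my-bucket",  # hypothetical bucket
    gcs_prefix="processor-output/123456789/0",  # hypothetical prefix
)

# Keys are normalized into BigQuery-safe column names,
# e.g. "Phone #:" -> "phone_num"; repeated fields collapse into lists.
fields = wrapped_document.form_fields_to_dict()
print(fields)

# The same dictionary loaded as a single newline-delimited JSON row;
# schema fields may be added or relaxed by the load job.
job = wrapped_document.form_fields_to_bigquery(
    dataset_name="my_dataset", table_name="my_table", project_id="my-project"
)
job.result()  # block until the load job completes
```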
diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index c3116500..9c2d876d 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -120,6 +120,21 @@ def _table_wrapper_from_documentai_table(
     )
 
 
+@dataclasses.dataclass
+class Block:
+    """Represents a wrapped documentai.Document.Page.Block.
+
+    Attributes:
+        documentai_block (google.cloud.documentai.Document.Page.Block):
+            Required. The original google.cloud.documentai.Document.Page.Block object.
+        text (str):
+            Required. UTF-8 encoded text.
+    """
+
+    documentai_block: documentai.Document.Page.Block
+    text: str
+
+
 @dataclasses.dataclass
 class Paragraph:
     """Represents a wrapped documentai.Document.Page.Paragraph.
@@ -191,6 +206,32 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
     return result_text
 
 
+def _get_blocks(blocks: List[documentai.Document.Page.Block], text: str) -> List[Block]:
+    r"""Returns a list of Block.
+
+    Args:
+        blocks (List[documentai.Document.Page.Block]):
+            Required. A list of documentai.Document.Page.Block objects.
+        text (str):
+            Required. UTF-8 encoded text in reading order
+            from the document.
+    Returns:
+        List[Block]:
+            A list of Blocks.
+    """
+    result = []
+
+    for block in blocks:
+        result.append(
+            Block(
+                documentai_block=block,
+                text=_text_from_layout(layout=block.layout, text=text),
+            )
+        )
+
+    return result
+
+
 def _get_paragraphs(
     paragraphs: List[documentai.Document.Page.Paragraph], text: str
 ) -> List[Paragraph]:
@@ -339,6 +380,10 @@ class Page:
         paragraphs (List[Paragraph]):
             Required. A list of visually detected text paragraphs
             on the page. A collection of lines that a human
             would perceive as a paragraph.
+        blocks (List[Block]):
+            Required. A list of visually detected text blocks
+            on the page. A collection of lines that a human
+            would perceive as a block.
         tables (List[Table]):
             Required. A list of visually detected tables on
@@ -350,6 +395,7 @@ class Page:
     form_fields: List[FormField] = dataclasses.field(init=False, repr=False)
     lines: List[Line] = dataclasses.field(init=False, repr=False)
     paragraphs: List[Paragraph] = dataclasses.field(init=False, repr=False)
+    blocks: List[Block] = dataclasses.field(init=False, repr=False)
     tables: List[Table] = dataclasses.field(init=False, repr=False)
 
     def __post_init__(self):
@@ -369,4 +415,5 @@ def __post_init__(self):
         self.paragraphs = _get_paragraphs(
             paragraphs=self.documentai_page.paragraphs, text=self.text
         )
+        self.blocks = _get_blocks(blocks=self.documentai_page.blocks, text=self.text)
         self.tables = tables
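Blocks slot in next to lines and paragraphs on the wrapped page, so traversal stays uniform. A sketch using the shard from the unit-test resources:

```python
from google.cloud.documentai_toolbox import document

wrapped_document = document.Document.from_document_path(
    document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
)

for page in wrapped_document.pages:
    for block in page.blocks:
        # Each wrapper carries the decoded text plus the raw proto.
        print(block.text)
        layout = block.documentai_block.layout  # e.g. for bounding_poly access
```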
diff --git a/samples/snippets/entities_to_bigquery_sample.py b/samples/snippets/entities_to_bigquery_sample.py
index 0709680b..c2397c24 100644
--- a/samples/snippets/entities_to_bigquery_sample.py
+++ b/samples/snippets/entities_to_bigquery_sample.py
@@ -42,6 +42,11 @@ def entities_to_bigquery_sample(
         dataset_name=dataset_name, table_name=table_name, project_id=project_id
     )
 
+    # Also supported:
+    # job = wrapped_document.form_fields_to_bigquery(
+    #     dataset_name=dataset_name, table_name=table_name, project_id=project_id
+    # )
+
     print("Document entities loaded into BigQuery")
     print(f"Job ID: {job.job_id}")
     print(f"Table: {job.destination.path}")
diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py
index 33ff8c0f..ea59080f 100644
--- a/samples/snippets/quickstart_sample.py
+++ b/samples/snippets/quickstart_sample.py
@@ -41,6 +41,8 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
     for idx, page in enumerate(wrapped_document.pages):
         print(f"Page {idx}")
 
+        for block in page.blocks:
+            print(block.text)
         for paragraph in page.paragraphs:
             print(paragraph.text)
 
diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt
index c3cf6219..00c37372 100644
--- a/samples/snippets/requirements-test.txt
+++ b/samples/snippets/requirements-test.txt
@@ -1,3 +1,3 @@
-pytest==7.2.2
-mock==5.0.1
+pytest==7.3.1
+mock==5.0.2
 google-cloud-bigquery==3.9.0
diff --git a/tests/unit/test_converter_helpers.py b/tests/unit/test_converter_helpers.py
index dfafc285..8ff15686 100644
--- a/tests/unit/test_converter_helpers.py
+++ b/tests/unit/test_converter_helpers.py
@@ -407,7 +407,7 @@ def test_upload(mock_upload_file):
     files["document_1"] = "Document"
 
     converter_helpers._upload(files, gcs_output_path="gs://output/")
-    mock_upload_file.assert_called_with("output", "/document_1.json", "Document")
+    mock_upload_file.assert_called_with("output", "/document_1.json", "Document", None)
 
 
 def test_upload_with_format_error():
diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py
index 0b358ab6..94fb78f5 100644
--- a/tests/unit/test_document.py
+++ b/tests/unit/test_document.py
@@ -186,7 +186,7 @@ def test_get_batch_process_metadata_with_no_metadata(mock_docai):
 
 
 @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
-def test_document_from_batch_process_operation_with_invalid_metadata_type(mock_docai):
+def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
     with pytest.raises(
         ValueError,
         match="Operation metadata type is not",
@@ -206,6 +206,19 @@ def test_document_from_batch_process_operation_with_invalid_metadata_d
         document._get_batch_process_metadata(location, operation_name)
 
 
+def test_bigquery_column_name():
+    string_map = {
+        "Phone #:": "phone_num",
+        "Emergency Contact:": "emergency_contact",
+        "Marital Status:": "marital_status",
+        "Are you currently taking any medication? (If yes, please describe):": "are_you_currently_taking_any_medication_if_yes_please_describe",
+        "Describe your medical concerns (symptoms, diagnoses, etc):": "describe_your_medical_concerns_symptoms_diagnoses_etc",
+    }
+
+    for key, value in string_map.items():
+        assert document._bigquery_column_name(key) == value
+
+
 def test_document_from_document_path_with_single_shard():
     actual = document.Document.from_document_path(
         document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
@@ -401,6 +414,43 @@ def test_get_form_field_by_name(get_bytes_form_parser_mock):
     assert actual[0].field_value == "(906) 917-3486"
 
 
+def test_form_fields_to_dict(get_bytes_form_parser_mock):
+    doc = document.Document.from_gcs(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
+    )
+    actual = doc.form_fields_to_dict()
+
+    get_bytes_form_parser_mock.assert_called_once()
+
+    assert len(actual) == 17
+    assert actual.get("address") == "24 Barney Lane"
+    assert actual.get("city") == "Towaco"
+
+
+@mock.patch("google.cloud.documentai_toolbox.wrappers.document.bigquery")
+def test_form_fields_to_bigquery(mock_bigquery, get_bytes_form_parser_mock):
+    client = mock_bigquery.Client.return_value
+
+    mock_table = mock.Mock()
+    client.dataset.table.return_value = mock_table
+
+    mock_load_job = mock.Mock()
+    client.load_table_from_json.return_value = mock_load_job
+
+    doc = document.Document.from_gcs(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
+    )
+
+    actual = doc.form_fields_to_bigquery(
+        dataset_name="test_dataset", table_name="test_table", project_id="test_project"
+    )
+
+    get_bytes_form_parser_mock.assert_called_once()
+    mock_bigquery.Client.assert_called_once()
+
+    assert actual
+
+
 def test_entities_to_dict(get_bytes_single_file_mock):
     doc = document.Document.from_gcs(
         gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
"Invoice\n" +def test_get_blocks(docproto): + docproto_blocks = docproto.pages[0].blocks + + blocks = page._get_blocks(blocks=docproto_blocks, text=docproto.text) + + assert len(blocks) == 31 + assert blocks[0].text == "Invoice\n" + + def test_get_paragraphs(docproto): docproto_paragraphs = docproto.pages[0].paragraphs @@ -218,6 +227,13 @@ def test_FormField(): assert form_field.field_value == "Sally Walker" +def test_Block(): + docai_block = documentai.Document.Page.Block() + block = page.Block(documentai_block=docai_block, text="test_block") + + assert block.text == "test_block" + + def test_Paragraph(): docai_paragraph = documentai.Document.Page.Paragraph() paragraph = page.Paragraph( @@ -254,5 +270,7 @@ def test_Page(docproto): assert len(wrapped_page.lines) == 37 assert len(wrapped_page.paragraphs) == 31 + assert len(wrapped_page.blocks) == 31 assert wrapped_page.lines[0].text == "Invoice\n" assert wrapped_page.paragraphs[30].text == "Supplies used for Project Q.\n" + assert wrapped_page.blocks[30].text == "Supplies used for Project Q.\n"