From 1d6ff89b5eb19d0b25b2cb083d5f3eae9374392c Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Mon, 24 Jul 2023 13:14:49 -0500 Subject: [PATCH] refactor: Switch block to use jmespath instead of custom format for config --- .../converters/config/block.py | 225 +++++++++--------- .../AWS/AWS-config.json | 24 +- .../Azure/form-config.json | 32 +-- .../Azure/invoice-config.json | 32 +-- setup.py | 2 + .../converters/test_config_type_1.json | 25 +- .../converters/test_config_type_2.json | 27 +-- .../converters/test_config_type_3.json | 32 +-- .../resources/converters/test_type_1.json | 4 +- .../resources/converters/test_type_2.json | 8 +- .../resources/converters/test_type_3.json | 37 ++- 11 files changed, 214 insertions(+), 234 deletions(-) diff --git a/google/cloud/documentai_toolbox/converters/config/block.py b/google/cloud/documentai_toolbox/converters/config/block.py index c7ed9b53..19c0cdef 100644 --- a/google/cloud/documentai_toolbox/converters/config/block.py +++ b/google/cloud/documentai_toolbox/converters/config/block.py @@ -15,10 +15,13 @@ # import dataclasses + import json from types import SimpleNamespace from typing import List, Optional, Type +import jmespath + from google.cloud import documentai @@ -59,9 +62,7 @@ class Block: Attributes: bounding_box (str): Required. - block_references: - Optional. - block_id: + id_: Optional. confidence: Optional. @@ -72,15 +73,15 @@ class Block: page_number: Optional. """ - type_: SimpleNamespace = dataclasses.field(init=True, repr=False) - text: SimpleNamespace = dataclasses.field(init=True, repr=False) + type_: str = dataclasses.field(init=True, repr=False) + text: str = dataclasses.field(init=True, repr=False) + docproto_width: float = dataclasses.field(init=False, repr=False) + docproto_height: float = dataclasses.field(init=False, repr=False) + bounding_box: Optional[SimpleNamespace] = dataclasses.field( init=True, repr=False, default=None ) - block_references: Optional[SimpleNamespace] = dataclasses.field( - init=True, repr=False, default=None - ) - block_id: Optional[SimpleNamespace] = dataclasses.field( + id_: Optional[SimpleNamespace] = dataclasses.field( init=False, repr=False, default=None ) confidence: Optional[SimpleNamespace] = dataclasses.field( @@ -101,10 +102,10 @@ class Block: bounding_height: Optional[SimpleNamespace] = dataclasses.field( init=False, repr=False, default=None ) - bounding_type: Optional[SimpleNamespace] = dataclasses.field( + bounding_type: Optional[int] = dataclasses.field( init=False, repr=False, default=None ) - bounding_unit: Optional[SimpleNamespace] = dataclasses.field( + bounding_unit: Optional[str] = dataclasses.field( init=False, repr=False, default=None ) bounding_x: Optional[SimpleNamespace] = dataclasses.field( @@ -113,12 +114,6 @@ class Block: bounding_y: Optional[SimpleNamespace] = dataclasses.field( init=False, repr=False, default=None ) - docproto_width: Optional[float] = dataclasses.field( - init=False, repr=False, default=None - ) - docproto_height: Optional[float] = dataclasses.field( - init=False, repr=False, default=None - ) @classmethod def load_blocks_from_schema( @@ -142,98 +137,104 @@ def load_blocks_from_schema( From original annotation data and provided config. """ - objects = json.loads(input_data) - schema_json = json.loads( - input_config, object_hook=lambda d: SimpleNamespace(**d) - ) - - entities = schema_json.entity_object - type_ = schema_json.entity.type_ - - mention_text = schema_json.entity.mention_text - - id_ = getattr(schema_json.entity, "id", None) - document_height = ( - getattr(schema_json.page, "height", None) - if hasattr(schema_json, "page") - else None - ) - document_width = ( - getattr(schema_json.page, "width", None) - if hasattr(schema_json, "page") - else None - ) - - confidence = getattr(schema_json.entity, "confidence", None) - page_number = getattr(schema_json.entity, "page_number", None) - normalized_vertices = getattr( - schema_json.entity.normalized_vertices, "base", None - ) - bounding_width = getattr(schema_json.entity.normalized_vertices, "width", None) - bounding_height = getattr( - schema_json.entity.normalized_vertices, "height", None - ) - bounding_type = getattr(schema_json.entity.normalized_vertices, "type", None) - bounding_unit = getattr(schema_json.entity.normalized_vertices, "unit", None) - bounding_x = getattr(schema_json.entity.normalized_vertices, "x", None) - bounding_y = getattr(schema_json.entity.normalized_vertices, "y", None) - - blocks: List[Block] = [] - ens = _get_target_object(objects, entities) - for i in ens: - entity = i - - block_text = "" - - if type_ == f"{entities}:self": - block_type = i - entity = _get_target_object(objects, f"{entities}.{i}") - else: - block_type = _get_target_object(entity, type_) - - if "||" in mention_text: - text_commands = mention_text.split("||") - for command in text_commands: - if command in entity: - block_text = _get_target_object(entity, command) - continue - else: - block_text = _get_target_object(entity, mention_text) - - b = Block( - type_=block_type, - text=block_text, - bounding_box=_get_target_object(entity, normalized_vertices), - ) - - if id_: - b.id_ = _get_target_object(entity, id_) - if confidence: - b.confidence = _get_target_object(entity, confidence) - if page_number and page_number in entity: - b.page_number = _get_target_object(entity, page_number) - if bounding_width: - b.bounding_width = _get_target_object(b.bounding_box, bounding_width) - if bounding_height: - b.bounding_height = _get_target_object(b.bounding_box, bounding_height) - if document_height: - b.page_height = _get_target_object(objects, document_height) - if document_width: - b.page_width = _get_target_object(objects, document_width) - if bounding_type: - b.bounding_type = bounding_type - if bounding_unit: - b.bounding_unit = bounding_unit - if bounding_x: - b.bounding_x = bounding_x - if bounding_y: - b.bounding_y = bounding_y - - if b.page_number is None: - b.page_number = 0 - - b.docproto_width = base_docproto.pages[int(b.page_number)].dimension.width - b.docproto_height = base_docproto.pages[int(b.page_number)].dimension.height - - blocks.append(b) - return blocks + annotations = json.loads(input_data) + schema = json.loads(input_config) + + data_class_fields = { + field_name: jmespath.search(jmespath_expr, annotations) + for field_name, jmespath_expr in schema.items() + } + b = Block(**data_class_fields) + + print(b) + + # entities = schema_json.entity_object + # type_ = schema_json.entity.type_ + + # mention_text = schema_json.entity.mention_text + + # id_ = getattr(schema_json.entity, "id", None) + # document_height = ( + # getattr(schema_json.page, "height", None) + # if hasattr(schema_json, "page") + # else None + # ) + # document_width = ( + # getattr(schema_json.page, "width", None) + # if hasattr(schema_json, "page") + # else None + # ) + + # confidence = getattr(schema_json.entity, "confidence", None) + # page_number = getattr(schema_json.entity, "page_number", None) + # normalized_vertices = getattr( + # schema_json.entity.normalized_vertices, "base", None + # ) + # bounding_width = getattr(schema_json.entity.normalized_vertices, "width", None) + # bounding_height = getattr( + # schema_json.entity.normalized_vertices, "height", None + # ) + # bounding_type = getattr(schema_json.entity.normalized_vertices, "type", None) + # bounding_unit = getattr(schema_json.entity.normalized_vertices, "unit", None) + # bounding_x = getattr(schema_json.entity.normalized_vertices, "x", None) + # bounding_y = getattr(schema_json.entity.normalized_vertices, "y", None) + + # blocks: List[Block] = [] + # ens = _get_target_object(objects, entities) + # for i in ens: + # entity = i + + # block_text = "" + + # if type_ == f"{entities}:self": + # block_type = i + # entity = _get_target_object(objects, f"{entities}.{i}") + # else: + # block_type = _get_target_object(entity, type_) + + # if "||" in mention_text: + # text_commands = mention_text.split("||") + # for command in text_commands: + # if command in entity: + # block_text = _get_target_object(entity, command) + # continue + # else: + # block_text = _get_target_object(entity, mention_text) + + # b = Block( + # type_=block_type, + # text=block_text, + # bounding_box=_get_target_object(entity, normalized_vertices), + # ) + + # if id_: + # b.id_ = _get_target_object(entity, id_) + # if confidence: + # b.confidence = _get_target_object(entity, confidence) + # if page_number and page_number in entity: + # b.page_number = _get_target_object(entity, page_number) + # if bounding_width: + # b.bounding_width = _get_target_object(b.bounding_box, bounding_width) + # if bounding_height: + # b.bounding_height = _get_target_object(b.bounding_box, bounding_height) + # if document_height: + # b.page_height = _get_target_object(objects, document_height) + # if document_width: + # b.page_width = _get_target_object(objects, document_width) + # if bounding_type: + # b.bounding_type = bounding_type + # if bounding_unit: + # b.bounding_unit = bounding_unit + # if bounding_x: + # b.bounding_x = bounding_x + # if bounding_y: + # b.bounding_y = bounding_y + + # if b.page_number is None: + # b.page_number = 0 + + # b.docproto_width = base_docproto.pages[int(b.page_number)].dimension.width + # b.docproto_height = base_docproto.pages[int(b.page_number)].dimension.height + + # blocks.append(b) + # return blocks diff --git a/samples/sample-converter-configs/AWS/AWS-config.json b/samples/sample-converter-configs/AWS/AWS-config.json index d33c1d50..3f43582a 100644 --- a/samples/sample-converter-configs/AWS/AWS-config.json +++ b/samples/sample-converter-configs/AWS/AWS-config.json @@ -1,14 +1,14 @@ { - "entity_object":"Blocks", - "entity": { - "type_":"BlockType", - "mention_text":"Text", - "normalized_vertices":{ - "type":"1", - "unit":"normalized", - "base":"Geometry.Polygon", - "x":"X", - "y":"Y" - } + "entity_object": "Blocks", + "entity": { + "type_": "BlockType", + "mention_text": "Text", + "normalized_vertices": { + "type": 1, + "unit": "normalized", + "base": "Geometry.Polygon", + "x": "X", + "y": "Y" } -} \ No newline at end of file + } +} diff --git a/samples/sample-converter-configs/Azure/form-config.json b/samples/sample-converter-configs/Azure/form-config.json index 19749112..fbc6dc92 100644 --- a/samples/sample-converter-configs/Azure/form-config.json +++ b/samples/sample-converter-configs/Azure/form-config.json @@ -1,18 +1,18 @@ { - "entity_object":"analyzeResult.pageResults.0.keyValuePairs", - "page": { - "height":"analyzeResult.readResults.0.height", - "width":"analyzeResult.readResults.0.width" - }, - "entity": { - "type_":"key.text", - "mention_text":"value.text", - "normalized_vertices":{ - "type":"3", - "unit":"inch", - "base":"key.boundingBox", - "x":"x", - "y":"y" - } + "entity_object": "analyzeResult.pageResults[0].keyValuePairs", + "page": { + "height": "analyzeResult.readResults[0].height", + "width": "analyzeResult.readResults[0].width" + }, + "entity": { + "type_": "key.text", + "mention_text": "value.text", + "normalized_vertices": { + "type": 3, + "unit": "inch", + "base": "key.boundingBox", + "x": "x", + "y": "y" } -} \ No newline at end of file + } +} diff --git a/samples/sample-converter-configs/Azure/invoice-config.json b/samples/sample-converter-configs/Azure/invoice-config.json index 3ec3468e..a4c33947 100644 --- a/samples/sample-converter-configs/Azure/invoice-config.json +++ b/samples/sample-converter-configs/Azure/invoice-config.json @@ -1,18 +1,18 @@ { - "entity_object":"analyzeResult.documentResults.0.fields", - "page": { - "height":"analyzeResult.readResults.0.height", - "width":"analyzeResult.readResults.0.width" - }, - "entity": { - "type_":"analyzeResult.documentResults.0.fields:self", - "mention_text":"text", - "normalized_vertices":{ - "type":"3", - "unit":"pxl", - "base":"boundingBox", - "x":"x", - "y":"y" - } + "entity_object": "analyzeResult.documentResults.0.fields", + "page": { + "height": "analyzeResult.readResults.0.height", + "width": "analyzeResult.readResults.0.width" + }, + "entity": { + "type_": "analyzeResult.documentResults.0.fields:self", + "mention_text": "text", + "normalized_vertices": { + "type": 3, + "unit": "pxl", + "base": "boundingBox", + "x": "x", + "y": "y" } -} \ No newline at end of file + } +} diff --git a/setup.py b/setup.py index 98f25e8b..15e57a1f 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,8 @@ "immutabledict >= 2.0.0, < 3.0.0dev", "Pillow >= 9.5.0, < 10.0.0", "Jinja2 >= 3.1.0, <= 3.1.2", + "dataclasses-json >= 0.5.13, < 1.0.0", + "jmespath >= 1.0.1, < 2.0.0", ), python_requires=">=3.7", classifiers=[ diff --git a/tests/unit/resources/converters/test_config_type_1.json b/tests/unit/resources/converters/test_config_type_1.json index 554bdaf9..d8e09ecc 100644 --- a/tests/unit/resources/converters/test_config_type_1.json +++ b/tests/unit/resources/converters/test_config_type_1.json @@ -1,15 +1,12 @@ { - "entity_object":"pages.1.Entities", - "entity": { - "mention_text":"Text", - "type_":"Type", - "page_number": "page", - "normalized_vertices":{ - "type":"1", - "unit":"inch", - "base":"bBox", - "x":"x", - "y":"y" - } - } -} \ No newline at end of file + "entities": "pages[1:].Entities", + "mention_text": "pages[1:].Entities.Text", + "type_": "pages[1:].Entities.Type", + "page_number": "pages[1:].Entities.page", + "confidence": "pages[1:].Entities.page.confidence", + "bbox_type": 1, + "unit": "inch", + "normalized_vertices" : "pages[1:].Entities.bBox", + "x": "pages[1:].Entities.bBox[*].x", + "y": "pages[1:].Entities.bBox[*].y" +} diff --git a/tests/unit/resources/converters/test_config_type_2.json b/tests/unit/resources/converters/test_config_type_2.json index 5403f0f5..7071586f 100644 --- a/tests/unit/resources/converters/test_config_type_2.json +++ b/tests/unit/resources/converters/test_config_type_2.json @@ -1,16 +1,13 @@ { - "entity_object":"document.entities", - "entity": { - "type_":"type", - "mention_text":"mentionText", - "normalized_vertices":{ - "type":"2", - "unit":"normalized", - "base":"pageAnchor.pageRefs.0.boundingPoly.normalizedVertices", - "x":"left", - "y":"top", - "width":"width", - "height":"height" - } - } -} \ No newline at end of file + "entities":"document.entities", + "mention_text": "document.entities[*].mentionText", + "type_": "document.entities[*].type", + "confidence": "document.entities[*].confidence", + "bbox_type": 2, + "unit": "normalized", + "normalized_vertices" : "document.entities[*].pageAnchor.pageRefs[*].boundingPoly.normalizedVertices", + "x": "document.entities[*].pageAnchor.pageRefs[*].boundingPoly.normalizedVertices.left", + "y": "document.entities[*].pageAnchor.pageRefs[*].boundingPoly.normalizedVertices.top", + "width": "document.entities[*].pageAnchor.pageRefs[*].boundingPoly.normalizedVertices.width", + "height": "document.entities[*].pageAnchor.pageRefs[*].boundingPoly.normalizedVertices.height" +} diff --git a/tests/unit/resources/converters/test_config_type_3.json b/tests/unit/resources/converters/test_config_type_3.json index 8b45f0bd..4c10637b 100644 --- a/tests/unit/resources/converters/test_config_type_3.json +++ b/tests/unit/resources/converters/test_config_type_3.json @@ -1,21 +1,13 @@ { - "entity_object":"Entities", - "page": { - "height":"page_height", - "width":"page_width" - }, - "entity": { - "type_":"Entities:self", - "mention_text":"Text||normalizedText", - "normalized_vertices":{ - "type":"3", - "unit":"pxl", - "base":"bBox", - "x":"x", - "y":"y" - }, - "id":"id", - "confidence":"confidence", - "page_number":"page" - } -} \ No newline at end of file + "page_height": "page_height", + "page_width": "page_width", + "entities": "Entities", + "type_": "Entities | keys(@)", + "confidence": "Entities.{key: @, value: {nested_key: @.confidence}}", + "page_number": "Entities.{key: @, value: {nested_key: @.page}}", + "mention_text": "Entities.{key: @, value: {nested_key: @.Text}}", + "normalized_text": "Entities.{key: @, value: {nested_key: @.normalizedText}}", + "bbox_type": 3, + "unit": "pxl", + "normalized_vertices": "Entities.{key: @, value: {nested_key: @.bBox}}" +} diff --git a/tests/unit/resources/converters/test_type_1.json b/tests/unit/resources/converters/test_type_1.json index 5fd4ef28..5c1ae78d 100644 --- a/tests/unit/resources/converters/test_type_1.json +++ b/tests/unit/resources/converters/test_type_1.json @@ -8,7 +8,7 @@ { "Type": "BusinessName", "Text": "411 I.T. Group", - "id":0, + "id": 0, "bBox": [ { "x": 4.083333, @@ -33,4 +33,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/tests/unit/resources/converters/test_type_2.json b/tests/unit/resources/converters/test_type_2.json index 5ed2411b..be548fe7 100644 --- a/tests/unit/resources/converters/test_type_2.json +++ b/tests/unit/resources/converters/test_type_2.json @@ -2,8 +2,8 @@ "document": { "uri": "", "mimeType": "application/pdf", - "page_height":1000, - "page_width":1000, + "page_height": 1000, + "page_width": 1000, "entities": [ { "type": "invoice_id", @@ -15,7 +15,7 @@ "boundingPoly": { "normalizedVertices": { "width": 0.03, - "height":0.01, + "height": 0.01, "left": 0.07906712, "top": 0.36043957 } @@ -27,4 +27,4 @@ } ] } -} \ No newline at end of file +} diff --git a/tests/unit/resources/converters/test_type_3.json b/tests/unit/resources/converters/test_type_3.json index ebba08cd..83806cb4 100644 --- a/tests/unit/resources/converters/test_type_3.json +++ b/tests/unit/resources/converters/test_type_3.json @@ -1,25 +1,16 @@ { - "DocumentType": "ScannedPDF", - "NoOfPages": 1, - "page_height":1000, - "page_width":1000, - "Entities": { - "BusinessName": { - "Text": "411 I.T. Group", - "normalizedText":"normalized 411 I.T. Group", - "id":0, - "bBox": [ - 392, - 116, - 558, - 116, - 558, - 145, - 392, - 145 - ], - "page": "0", - "confidence": 0.9997831 - } + "DocumentType": "ScannedPDF", + "NoOfPages": 1, + "page_height": 1000, + "page_width": 1000, + "Entities": { + "BusinessName": { + "Text": "411 I.T. Group", + "normalizedText": "normalized 411 I.T. Group", + "id": 0, + "bBox": [392, 116, 558, 116, 558, 145, 392, 145], + "page": "0", + "confidence": 0.9997831 } - } \ No newline at end of file + } +}