diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 7f291dbd..ec696b55 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:4f9b3b106ad0beafc2c8a415e3f62c1a0cc23cabea115dbe841b848f581cfe99 -# created: 2023-10-18T20:26:37.410353675Z + digest: sha256:30470597773378105e239b59fce8eb27cc97375580d592699206d17d117143d0 +# created: 2023-11-03T00:57:07.335914631Z diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index e97d89e4..221806ce 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: "3.9" + python-version: "3.10" - name: Install nox run: | python -m pip install --upgrade setuptools pip wheel diff --git a/CHANGELOG.md b/CHANGELOG.md index 72166310..1940f6c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.11.2-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.11.1-alpha...v0.11.2-alpha) (2023-11-07) + + +### Bug Fixes + +* Updates to hOCR Template to follow hOCR Spec ([#195](https://github.com/googleapis/python-documentai-toolbox/issues/195)) ([3f52e82](https://github.com/googleapis/python-documentai-toolbox/commit/3f52e82eaa741cd2c8a08e8398ed6f4b3f65c419)) + ## [0.11.1-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.11.0-alpha...v0.11.1-alpha) (2023-10-23) diff --git a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 index 63db0ada..dad071e1 100644 --- a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 +++ b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 @@ -6,8 +6,9 @@ + - + {% for page in pages -%} @@ -16,13 +17,13 @@ {% set bidx = loop.index0 -%} {% for paragraph in docai_block.paragraphs -%} {% set paridx = loop.index0 -%} - {% for line in paragraph.lines -%} +

{% for line in paragraph.lines -%} {% set lidx = loop.index0 -%} {{ line.text }}{% for token in line.tokens -%} {% set tidx = loop.index0 -%} {{ token.text }}{% endfor -%} {% endfor -%} - {% endfor -%} +

{% endfor -%}
{% endfor -%} {% endfor -%} diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index b0ddaabf..301f0b58 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.11.1-alpha" +__version__ = "0.11.2-alpha" diff --git a/noxfile.py b/noxfile.py index fc49ce9e..779d7921 100644 --- a/noxfile.py +++ b/noxfile.py @@ -301,7 +301,7 @@ def docs(session): ) -@nox.session(python="3.9") +@nox.session(python="3.10") def docfx(session): """Build the docfx yaml files for this library.""" diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index 331425b6..e763bc58 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ -pytest==7.4.2 +pytest==7.4.3 mock==5.1.0 -google-cloud-bigquery==3.12.0 +google-cloud-bigquery==3.13.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index f02bf7a1..6d2bd72c 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery==3.12.0 -google-cloud-documentai==2.20.1 -google-cloud-storage==2.12.0 -google-cloud-documentai-toolbox==0.10.2a0 +google-cloud-bigquery==3.13.0 +google-cloud-documentai==2.20.2 +google-cloud-storage==2.13.0 +google-cloud-documentai-toolbox==0.11.1a0 diff --git a/samples/snippets/test_convert_document_to_hocr_sample.py b/samples/snippets/test_convert_document_to_hocr_sample.py index e3ed9f2b..776c0b96 100644 --- a/samples/snippets/test_convert_document_to_hocr_sample.py +++ b/samples/snippets/test_convert_document_to_hocr_sample.py @@ -24,7 +24,11 @@ def test_convert_document_to_hocr_sample() -> None: document_path=document_path, document_title=document_title ) - with open("../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", + "r", + encoding="utf-8", + ) as f: expected = f.read() assert actual == expected diff --git a/setup.py b/setup.py index 7a29932e..abece197 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ "immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'", "Pillow >= 9.5.0, < 11.0.0", "Jinja2 >= 3.1.0, <= 4.0.0", + "hocr-spec >= 0.2.0", ), python_requires=">=3.7", classifiers=[ diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 3c64ab2e..0a9af7ff 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0 google-cloud-storage==2.7.0 numpy==1.19.5 pikepdf==6.2.9 +hocr-spec==0.2.0 diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index ed1905e2..a9d4c497 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy==1.21.6 pikepdf==8.2.3 +hocr-spec diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml index 0cd8e171..4e265f7d 100644 --- a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml +++ b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml @@ -6,84 +6,85 @@ + - + -
Invoice +

Invoice Invoice -DATE: 01/01/1970 +

DATE: 01/01/1970 DATE: 01/01/1970 INVOICE: NO. 001 INVOICE: NO. 001 -FROM: Company ABC +

FROM: Company ABC FROM: Company ABC user@companyabc.com user@companyabc.com -TO: John Doe +

TO: John Doe TO: John Doe johndoe@email.com johndoe@email.com -ADDRESS: 111 Main Street +

ADDRESS: 111 Main Street ADDRESS: 111 Main Street Anytown, USA Anytown, USA -ADDRESS: 222 Main Street +

ADDRESS: 222 Main Street ADDRESS: 222 Main Street Anytown, USA Anytown, USA -TERMS: 6 month contract +

TERMS: 6 month contract TERMS: 6 month contract DUE: 01/01/2025 DUE: 01/01/2025 -Item Description +

Item Description Item Description -Quantity +

Quantity Quantity -Price +

Price Price -Amount +

Amount Amount -Tool A +

Tool A Tool A -500 +

500 500 -$1.00 +

$1.00 $1.00 -$500.00 +

$500.00 $500.00 -Service B +

Service B Service B -1 +

1 1 -$900.00 +

$900.00 $900.00 -$900.00 +

$900.00 $900.00 -Resource C +

Resource C Resource C -50 +

50 50 -$12.00 +

$12.00 $12.00 -$600.00 +

$600.00 $600.00 -Subtotal +

Subtotal Subtotal -$2000.00 +

$2000.00 $2000.00 -Tax +

Tax Tax -$140.00 +

$140.00 $140.00 -BALANCE DUE +

BALANCE DUE BALANCE DUE -$2140.00 +

$2140.00 $2140.00 -NOTES: +

NOTES: NOTES: -Supplies used for Project Q. +

Supplies used for Project Q. Supplies used for Project Q. -

+

\ No newline at end of file diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index d988e1e0..86366f27 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from io import BytesIO import json import os import shutil @@ -32,6 +33,8 @@ from google.cloud import documentai from google.cloud.documentai_toolbox import document, gcs_utilities +from hocr_spec import HocrValidator + def get_bytes(file_name): result = [] @@ -689,8 +692,15 @@ def test_export_hocr_str(): ) actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0") + assert actual_hocr + validator = HocrValidator(profile="standard") + report = validator.validate(BytesIO(actual_hocr.encode("utf-8")), parse_strict=True) + + assert report.format("bool") - with open("tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8" + ) as f: expected = f.read() assert actual_hocr == expected