diff --git a/.gitignore b/.gitignore
index b4243ced..f7ae0606 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ pip-log.txt
.nox
.cache
.pytest_cache
+.mypy_cache/

# Mac
diff --git a/.repo-metadata.json b/.repo-metadata.json
index 51750feb..b2673951 100644
--- a/.repo-metadata.json
+++ b/.repo-metadata.json
@@ -2,7 +2,7 @@
  "name": "documentai-toolbox",
  "name_pretty": "Document AI Toolbox",
  "issue_tracker": "https://github.com/googleapis/python-documentai-toolbox/issues",
-  "client_documentation": "",
+  "client_documentation": "https://cloud.google.com/python/docs/reference/documentai-toolbox/latest",
  "release_level": "preview",
  "language": "python",
  "library_type": "OTHER",
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b78004f..c67e1bc6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,17 @@
# Changelog

+## [0.1.1-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.1.0-alpha...v0.1.1-alpha) (2023-02-08)
+
+
+### Bug Fixes
+
+* Updated Pip install name in README ([#52](https://github.com/googleapis/python-documentai-toolbox/issues/52)) ([dad8c8b](https://github.com/googleapis/python-documentai-toolbox/commit/dad8c8bfb6241eaa1e24f0b239d39d1396c735c8))
+
+
+### Documentation
+
+* **samples:** Added quickstart sample ([#27](https://github.com/googleapis/python-documentai-toolbox/issues/27)) ([23a0791](https://github.com/googleapis/python-documentai-toolbox/commit/23a0791633b0c2c2fb65f3706ecb279d058239ad))
+
## [0.1.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.1.0-alpha...v0.1.0-alpha) (2023-01-31)
diff --git a/README.rst b/README.rst
index f02ecf37..499d3426 100644
--- a/README.rst
+++ b/README.rst
@@ -1,17 +1,14 @@
-----
**Disclaimer**

The Document AI Toolbox is in an experimental state. This library is a work-in-progress and is likely to have backwards-incompatible changes. Users of the toolbox might need to rewrite their code when upgrading the toolbox version.
-----
-
Document AI Toolbox
=================================

|experimental| |versions|

-`Document AI Toolbox`_: Document AI Toolbox aims to reduce the friction of managing, manipulating, and extracting information from outputs of Document AI’s BatchProcessDocument method (JSON files written to user-managed GCS buckets), programmatically.
+`Document AI Toolbox`_: Document AI Toolbox aims to reduce the friction of managing, manipulating, and extracting information from outputs of Document AI’s BatchProcessDocument (JSON files written to user-managed GCS buckets) and ProcessDocument methods, programmatically.

.. |experimental| image:: https://img.shields.io/badge/support-experimental-red.svg
   :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#stability-levels
@@ -86,7 +83,7 @@ Mac/Linux
    pip install virtualenv
    virtualenv <your-env>
    source <your-env>/bin/activate
-    <your-env>/bin/pip install documentai-toolbox
+    <your-env>/bin/pip install google-cloud-documentai-toolbox


Windows
@@ -97,7 +94,7 @@ Windows
    pip install virtualenv
    virtualenv <your-env>
    <your-env>\Scripts\activate
-    <your-env>\Scripts\pip.exe install documentai-toolbox
+    <your-env>\Scripts\pip.exe install google-cloud-documentai-toolbox

Next Steps
~~~~~~~~~~
@@ -109,7 +106,7 @@ Next Steps
- View this `README`_ to see the full list of Cloud
  APIs that we cover.

-.. _`Client Library Documentation`: https://cloud.google.com/python/docs/reference/documentai/latest
+.. _`Client Library Documentation`: https://cloud.google.com/python/docs/reference/documentai-toolbox/latest
.. _`Document AI documentation`: https://cloud.google.com/document-ai
.. _`Document AI Product documentation`: https://cloud.google.com/document-ai/docs/overview
.. _`README`: https://github.com/googleapis/python-documentai-toolbox/blob/main/README.rst
diff --git a/docs/README.rst b/docs/README.rst
deleted file mode 100644
index d712f5f1..00000000
--- a/docs/README.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-Document AI Toolbox
-=================================
-
-|experimental| |versions|
-
-`Document AI Toolbox`_: Service to parse structured information from unstructured or semi-structured documents using state-of-the-art Google AI such as natural language, computer vision, translation, and AutoML.
-
-- `SDK Documentation`_
-
-.. |experimental| image:: https://img.shields.io/badge/support-experimental-red.svg
-   :target: https://github.com/googleapis/google-cloud-python/blob/main/README.rst#stability-levels
-.. |versions| image:: https://img.shields.io/pypi/pyversions/google-analytics-admin.svg
-   :target: https://pypi.org/project/google-analytics-admin/
-
-.. _Document AI Toolbox: LINK
-.. _SDK Documentation: LINK
-
-Quick Start
------------
-
-In order to use this library, you first need to go through the following steps:
-
-1. `Select or create a Cloud Platform project.`_
-2. `Enable billing for your project.`_
-3. `Enable the Document AI API.`_
-4. `Setup Authentication.`_
-
-.. _Select or create a Cloud Platform project.: https://console.cloud.google.com/project
-.. _Enable billing for your project.: https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project
-.. _Enable the Document AI API.: https://cloud.google.com/document-ai/docs
-.. _Setup Authentication.: https://googleapis.dev/python/google-api-core/latest/auth.html
-
-Installation
-~~~~~~~~~~~~
-
-Install this library in a `virtualenv`_ using pip. `virtualenv`_ is a tool to
-create isolated Python environments. The basic problem it addresses is one of
-dependencies and versions, and indirectly permissions.
-
-With `virtualenv`_, it's possible to install this library without needing system
-install permissions, and without clashing with the installed system
-dependencies.
-
-.. _`virtualenv`: https://virtualenv.pypa.io/en/latest/
-
-
-Code samples and snippets
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Code samples and snippets live in the `samples/` folder.
-
-
-Supported Python Versions
-^^^^^^^^^^^^^^^^^^^^^^^^^
-Our client libraries are compatible with all current `active`_ and `maintenance`_ versions of
-Python.
-
-Python >= 3.7
-
-.. _active: https://devguide.python.org/devcycle/#in-development-main-branch
-.. _maintenance: https://devguide.python.org/devcycle/#maintenance-branches
-
-Unsupported Python Versions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Python <= 3.6
-
-If you are using an `end-of-life`_
-version of Python, we recommend that you update as soon as possible to an actively supported version.
-
-.. _end-of-life: https://devguide.python.org/devcycle/#end-of-life-branches
-
-Mac/Linux
-^^^^^^^^^
-
-.. code-block:: console
-
-    pip install virtualenv
-    virtualenv <your-env>
-    source <your-env>/bin/activate
-    <your-env>/bin/pip install documentai-toolbox
-
-
-Windows
-^^^^^^^
-
-.. code-block:: console
-
-    pip install virtualenv
-    virtualenv <your-env>
-    <your-env>\Scripts\activate
-    <your-env>\Scripts\pip.exe install documentai-toolbox
-
-Next Steps
-~~~~~~~~~~
-
-- Read the `Client Library Documentation`_ for Document AI Toolbox
-  to see other available methods on the client.
-- Read the `Document AI API Product documentation`_ to learn
-  more about the product and see How-to Guides.
-- View this `README`_ to see the full list of Cloud
-  APIs that we cover.
-
-.. _Client Library Documentation: LINK
-.. _Document AI API Product documentation: LINK
-.. _README: https://github.com/googleapis/python-documentai-toolbox/blob/main/README.rst
\ No newline at end of file
diff --git a/docs/README.rst b/docs/README.rst
new file mode 120000
index 00000000..89a01069
--- /dev/null
+++ b/docs/README.rst
@@ -0,0 +1 @@
+../README.rst
\ No newline at end of file
diff --git a/google/cloud/documentai_toolbox/constants.py b/google/cloud/documentai_toolbox/constants.py
index b5b91e88..caa71098 100644
--- a/google/cloud/documentai_toolbox/constants.py
+++ b/google/cloud/documentai_toolbox/constants.py
@@ -15,3 +15,8 @@
#
USER_AGENT_PRODUCT = "documentai-toolbox"
+
+JSON_EXTENSION = ".json"
+JSON_MIMETYPE = "application/json"
+
+FILE_CHECK_REGEX = r"(.*[.].*$)"
diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py
index d1ae123d..903642a6 100644
--- a/google/cloud/documentai_toolbox/version.py
+++ b/google/cloud/documentai_toolbox/version.py
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-__version__ = "0.1.0-alpha"
+__version__ = "0.1.1-alpha"
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
index f369c9ed..69b78743 100644
--- a/google/cloud/documentai_toolbox/wrappers/document.py
+++ b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -16,8 +16,9 @@
"""Wrappers for Document AI Document type."""

import dataclasses
+import os
import re
-from typing import List, Optional
+from typing import Dict, List, Optional

from google.api_core import client_info
from google.cloud import documentai
@@ -93,13 +94,11 @@ def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
        gcs_bucket_name (str):
            Required. The name of the gcs bucket.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_bucket_name={bucket} .
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`.
        gcs_prefix (str):
            Required. The prefix of the json files in the target_folder
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_prefix={optional_folder}/{target_folder}/ .
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.

    Returns:
        List[bytes]:
            A list of bytes.
@@ -111,9 +110,11 @@ def _get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
    blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)

    for blob in blob_list:
-        if blob.name.endswith(".json"):
-            blob_as_bytes = blob.download_as_bytes()
-            result.append(blob_as_bytes)
+        if (
+            blob.name.endswith(constants.JSON_EXTENSION)
+            or blob.content_type == constants.JSON_MIMETYPE
+        ):
+            result.append(blob.download_as_bytes())

    return result
@@ -125,13 +126,11 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
        gcs_bucket_name (str):
            Required. The name of the gcs bucket.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_bucket_name={bucket}.
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`.
        gcs_prefix (str):
            Required. The prefix of the json files in the target_folder.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_prefix={optional_folder}/{target_folder}/.
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.

    Returns:
        List[google.cloud.documentai.Document]:
            A list of documentai.Documents.

    """
    shards = []

-    file_check = re.match(r"(.*[.].*$)", gcs_prefix)
+    file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)

    if file_check is not None:
        raise ValueError("gcs_prefix cannot contain file types")
@@ -147,7 +146,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume
    byte_array = _get_bytes(gcs_bucket_name, gcs_prefix)

    for byte in byte_array:
-        shards.append(documentai.Document.from_json(byte))
+        shards.append(documentai.Document.from_json(byte, ignore_unknown_fields=True))

    return shards
@@ -159,21 +158,20 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
        gcs_bucket_name (str):
            Required. The name of the gcs bucket.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_bucket_name={bucket}.
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_bucket_name=`bucket`.
        gcs_prefix (str):
            Required. The prefix of the json files in the target_folder.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_prefix={optional_folder}/{target_folder}/ .
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.

    Returns:
        None.

    """
-    display_filename_prefix_middle = "├──"
-    display_filename_prefix_last = "└──"
+    FILENAME_TREE_MIDDLE = "├──"
+    FILENAME_TREE_LAST = "└──"
+    FILES_TO_DISPLAY = 4

-    file_check = re.match(r"(.*[.].*$)", gcs_prefix)
+    file_check = re.match(constants.FILE_CHECK_REGEX, gcs_prefix)

    if file_check is not None:
        raise ValueError("gcs_prefix cannot contain file types")
@@ -181,34 +179,26 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
    storage_client = _get_storage_client()
    blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)

-    path_list = {}
+    path_list: Dict[str, List[str]] = {}

    for blob in blob_list:
-        file_path = blob.name.split("/")
-        file_name = file_path.pop()
-
-        file_path2 = "/".join(file_path)
+        directory, file_name = os.path.split(blob.name)

-        if file_path2 in path_list:
-            path_list[file_path2] += f"{file_name},"
+        if directory in path_list:
+            path_list[directory].append(file_name)
        else:
-            path_list[file_path2] = f"{file_name},"
-
-    for key in path_list:
-        a = path_list[key].split(",")
-        a.pop()
-        print(f"{key}")
-        togo = 4
-        for idx, val in enumerate(a):
-            if idx == len(a) - 1:
-                if len(a) > 4:
+            path_list[directory] = [file_name]
+
+    for directory, files in path_list.items():
+        print(f"{directory}")
+        dir_size = len(files)
+        for idx, file_name in enumerate(files):
+            if idx == dir_size - 1:
+                if dir_size > FILES_TO_DISPLAY:
                    print("│ ....")
-                print(f"{display_filename_prefix_last}{val}\n")
-            elif len(a) > 4 and togo != -1:
-                togo -= 1
-                print(f"{display_filename_prefix_middle}{val}")
-            elif len(a) <= 4:
-                print(f"{display_filename_prefix_middle}{val}")
+                print(f"{FILENAME_TREE_LAST}{file_name}\n")
+            elif idx <= FILES_TO_DISPLAY:
+                print(f"{FILENAME_TREE_MIDDLE}{file_name}")


@dataclasses.dataclass
@@ -227,13 +217,11 @@ class Document:
        gcs_bucket_name (Optional[str]):
            Optional. The name of the gcs bucket.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_bucket_name={bucket}.
        gcs_prefix (Optional[str]):
            Optional. The prefix of the json files in the target_folder.
-            Format: gs://{bucket}/{optional_folder}/{target_folder}/
-            where gcs_prefix={optional_folder}/{target_folder}/.
+            Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.
            For more information please take a look at https://cloud.google.com/storage/docs/json_api/v1/objects/list .
        pages: (List[Page]):
@@ -262,14 +250,14 @@ def from_document_path(

        Args:
            document_path (str):
-                Required. The path to the resp.
+                Required. The path to the document.json file.

        Returns:
            Document:
                A document from local document_path.

        """
-        with open(document_path, "r") as f:
-            doc = documentai.Document.from_json(f.read())
+        with open(document_path, "r", encoding="utf-8") as f:
+            doc = documentai.Document.from_json(f.read(), ignore_unknown_fields=True)

        return cls(shards=[doc])
@@ -298,13 +286,11 @@ def from_gcs(cls, gcs_bucket_name: str, gcs_prefix: str):
            gcs_bucket_name (str):
                Required. The gcs bucket.
-                Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/`
-                gcs_bucket_name="{bucket_name}".
+                Format: Given `gs://{bucket_name}/{optional_folder}/{operation_id}/` where gcs_bucket_name=`{bucket_name}`.
            gcs_prefix (str):
                Required. The prefix to the location of the target folder.
-                Format: Given `gs://{bucket_name}/{optional_folder}/{target_folder}/`
-                gcs_prefix="{optional_folder}/{target_folder}".
+                Format: Given `gs://{bucket_name}/optional_folder/target_folder` where gcs_prefix=`{optional_folder}/{target_folder}`.

        Returns:
            Document:
                A document from gcs.
@@ -330,9 +316,7 @@ def search_pages(
                A list of Pages.

        """
-        if (target_string is None and pattern is None) or (
-            target_string is not None and pattern is not None
-        ):
+        if (target_string and pattern) or (not target_string and not pattern):
            raise ValueError(
                "Exactly one of target_string and pattern must be specified."
            )
@@ -340,12 +324,8 @@ def search_pages(
        found_pages = []
        for page in self.pages:
            for paragraph in page.paragraphs:
-                if target_string is not None and target_string in paragraph.text:
-                    found_pages.append(page)
-                    break
-                elif (
-                    pattern is not None
-                    and re.search(pattern, paragraph.text) is not None
+                if (target_string and target_string in paragraph.text) or (
+                    pattern and re.search(pattern, paragraph.text)
                ):
                    found_pages.append(page)
                    break
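The document.py changes above are easiest to read next to the call pattern they serve. A minimal usage sketch (not taken from this patch; the bucket, prefix, and search string are placeholder values):

.. code-block:: python

    from google.cloud.documentai_toolbox import document

    # Wrap every JSON shard written by BatchProcessDocument under the prefix.
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="my-bucket", gcs_prefix="output/123456789/0/"
    )

    # search_pages() accepts exactly one of target_string or pattern.
    matching_pages = wrapped_document.search_pages(target_string="Invoice")

    # Preview which JSON shards live under a prefix before wrapping them.
    document.print_gcs_document_tree(
        gcs_bucket_name="my-bucket", gcs_prefix="output/123456789/0/"
    )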
diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index 9b4a901b..bafda584 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -54,18 +54,14 @@ def to_dataframe(self) -> pd.DataFrame:
                The DataFrame of the table.

        """
-        dataframe = None
-
        if not self.body_rows:
-            dataframe = pd.DataFrame(columns=self.header_rows)
-        else:
-            if self.header_rows != []:
-                dataframe = pd.DataFrame(self.body_rows)
-                dataframe.columns = self.header_rows
-            else:
+            return pd.DataFrame(columns=self.header_rows)

-                dataframe = pd.DataFrame(self.body_rows)
-                dataframe.columns = [None] * len(self.body_rows[0])
+        dataframe = pd.DataFrame(self.body_rows)
+        if self.header_rows:
+            dataframe.columns = self.header_rows
+        else:
+            dataframe.columns = [None] * len(self.body_rows[0])

        return dataframe
@@ -102,13 +98,13 @@ def sample_table_to_csv():

def _table_wrapper_from_documentai_table(
-    documentai_table: List[documentai.Document.Page.Table], text: str
+    documentai_table: documentai.Document.Page.Table, text: str
) -> Table:
    r"""Returns a Table.

    Args:
-        documentai_tables (List[documentai.Document.Page.Table]):
-            Required. A list of documentai.Document.Page.Table.
+        documentai_table (documentai.Document.Page.Table):
+            Required. A documentai.Document.Page.Table.
        text (str):
            Required. UTF-8 encoded text in reading order from the document.
@@ -119,22 +115,17 @@

    """

-    header_rows = []
-    body_rows = []
-
-    header_rows = _table_row_from_documentai_table_row(
+    header_rows = _table_rows_from_documentai_table_rows(
        table_rows=documentai_table.header_rows, text=text
    )
-    body_rows = _table_row_from_documentai_table_row(
+    body_rows = _table_rows_from_documentai_table_rows(
        table_rows=documentai_table.body_rows, text=text
    )

-    result = Table(
+    return Table(
        documentai_table=documentai_table, body_rows=body_rows, header_rows=header_rows
    )

-    return result
-

@dataclasses.dataclass
class Paragraph:
@@ -185,13 +176,11 @@ def _text_from_element_with_layout(

    result_text = ""

-    if element_with_layout.layout.text_anchor.text_segments == []:
+    if not element_with_layout.layout.text_anchor.text_segments:
        return ""
-    else:
-        for text_segment in element_with_layout.layout.text_anchor.text_segments:
-            result_text += text[
-                int(text_segment.start_index) : int(text_segment.end_index)
-            ]
+
+    for text_segment in element_with_layout.layout.text_anchor.text_segments:
+        result_text += text[int(text_segment.start_index) : int(text_segment.end_index)]

    return result_text
@@ -254,10 +243,10 @@ def _get_lines(lines: List[documentai.Document.Page.Line], text: str) -> List[Li
    return result


-def _table_row_from_documentai_table_row(
+def _table_rows_from_documentai_table_rows(
    table_rows: List[documentai.Document.Page.Table.TableRow], text: str
) -> List[str]:
-    r"""Returns a list rows from table_rows.
+    r"""Returns a list of rows from table_rows.

    Args:
        table_rows (List[documentai.Document.Page.Table.TableRow]):
        text (str):
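The reworked Table.to_dataframe above is the piece most samples build on. A short sketch of the intended use, assuming the Page wrapper exposes its tables as a list of Table objects (bucket and prefix are placeholders):

.. code-block:: python

    from google.cloud.documentai_toolbox import document

    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name="my-bucket", gcs_prefix="output/123456789/0/"
    )

    # Assumption: each wrapped page carries the tables built by
    # _table_wrapper_from_documentai_table() above.
    table = wrapped_document.pages[0].tables[0]

    # Header rows become column names when present; otherwise the columns
    # stay unnamed, mirroring Table.to_dataframe() above.
    dataframe = table.to_dataframe()
    dataframe.to_csv("table.csv", index=False)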
diff --git a/samples/__init__.py b/samples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/samples/snippets/__init__.py b/samples/snippets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py
new file mode 100644
index 00000000..22c65bbd
--- /dev/null
+++ b/samples/snippets/noxfile.py
@@ -0,0 +1,285 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import glob
+import os
+from pathlib import Path
+import sys
+from typing import Callable, Dict, List, Optional
+
+import nox
+
+
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# DO NOT EDIT THIS FILE EVER!
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+
+# Copy `noxfile_config.py` to your directory and modify it instead.
+
+
+# `TEST_CONFIG` dict is a configuration hook that allows users to
+# modify the test configurations. The values here should be in sync
+# with `noxfile_config.py`. Users will copy `noxfile_config.py` into
+# their directory and modify it.
+
+TEST_CONFIG = {
+    # You can opt out from the test for specific Python versions.
+    "ignored_versions": ["2.7", "3.6"],
+    # Old samples are opted out of enforcing Python type hints
+    # All new samples should feature them
+    "enforce_type_hints": False,
+    # An envvar key for determining the project id to use. Change it
+    # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+    # build specific Cloud project. You can also use your own string
+    # to use your own Cloud project.
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
+    # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+    # If you need to use a specific version of pip,
+    # change pip_version_override to the string representation
+    # of the version number, for example, "20.2.4"
+    "pip_version_override": None,
+    # A dictionary you want to inject into your test. Don't put any
+    # secrets here. These values will override predefined values.
+    "envs": {},
+}
+
+
+try:
+    # Ensure we can import noxfile_config in the project's directory.
+    sys.path.append(".")
+    from noxfile_config import TEST_CONFIG_OVERRIDE
+except ImportError as e:
+    print("No user noxfile_config found: detail: {}".format(e))
+    TEST_CONFIG_OVERRIDE = {}
+
+# Update the TEST_CONFIG with the user supplied values.
+TEST_CONFIG.update(TEST_CONFIG_OVERRIDE)
+
+
+def get_pytest_env_vars() -> Dict[str, str]:
+    """Returns a dict for pytest invocation."""
+    ret = {}
+
+    # Override the GCLOUD_PROJECT and the alias.
+    env_key = TEST_CONFIG["gcloud_project_env"]
+    # This should error out if not set.
+    ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
+    ret["GCLOUD_PROJECT"] = os.environ[env_key]  # deprecated
+
+    # Apply user supplied envs.
+    ret.update(TEST_CONFIG["envs"])
+    return ret
+
+
+# DO NOT EDIT - automatically generated.
+# All versions used to tested samples.
+ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8", "3.9", "3.10"]
+
+# Any default versions that should be ignored.
+IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
+
+TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
+
+INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False))
+
+# Error if a python version is missing
+nox.options.error_on_missing_interpreters = True
+
+#
+# Style Checks
+#
+
+
+def _determine_local_import_names(start_dir: str) -> List[str]:
+    """Determines all import names that should be considered "local".
+
+    This is used when running the linter to insure that import order is
+    properly checked.
+    """
+    file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)]
+    return [
+        basename
+        for basename, extension in file_ext_pairs
+        if extension == ".py"
+        or os.path.isdir(os.path.join(start_dir, basename))
+        and basename not in ("__pycache__")
+    ]
+
+
+# Linting with flake8.
+#
+# We ignore the following rules:
+#   ANN101: missing type annotation for self in method
+#   E203: whitespace before ‘:’
+#   E266: too many leading ‘#’ for block comment
+#   E501: line too long
+#   I202: Additional newline in a section of imports
+#
+# We also need to specify the rules which are ignored by default:
+# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121']
+FLAKE8_COMMON_ARGS = [
+    "--show-source",
+    "--builtin=gettext",
+    "--max-complexity=20",
+    "--import-order-style=google",
+    "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py",
+    "--ignore=ANN101,E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202",
+    "--max-line-length=88",
+]
+
+
+@nox.session
+def lint(session: nox.sessions.Session) -> None:
+    if not TEST_CONFIG["enforce_type_hints"]:
+        session.install("flake8", "flake8-import-order")
+    else:
+        session.install("flake8", "flake8-import-order", "flake8-annotations")
+
+    local_names = _determine_local_import_names(".")
+    args = FLAKE8_COMMON_ARGS + [
+        "--application-import-names",
+        ",".join(local_names),
+        ".",
+    ]
+    session.run("flake8", *args)
+
+
+#
+# Black
+#
+
+
+@nox.session
+def blacken(session: nox.sessions.Session) -> None:
+    session.install("black")
+    python_files = [path for path in os.listdir(".") if path.endswith(".py")]
+
+    session.run("black", *python_files)
+
+
+#
+# Sample Tests
+#
+
+
+PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"]
+
+
+def _session_tests(
+    session: nox.sessions.Session, post_install: Callable = None
+) -> None:
+    # check for presence of tests
+    test_list = glob.glob("*_test.py") + glob.glob("test_*.py")
+    test_list.extend(glob.glob("tests"))
+
+    if len(test_list) == 0:
+        print("No tests found, skipping directory.")
+        return
+
+    if TEST_CONFIG["pip_version_override"]:
+        pip_version = TEST_CONFIG["pip_version_override"]
+        session.install(f"pip=={pip_version}")
+    """Runs py.test for a particular project."""
+    concurrent_args = []
+    if os.path.exists("requirements.txt"):
+        if os.path.exists("constraints.txt"):
+            session.install("-r", "requirements.txt", "-c", "constraints.txt")
+        else:
+            session.install("-r", "requirements.txt")
+        with open("requirements.txt") as rfile:
+            packages = rfile.read()
+
+    if os.path.exists("requirements-test.txt"):
+        if os.path.exists("constraints-test.txt"):
+            session.install(
+                "-r", "requirements-test.txt", "-c", "constraints-test.txt"
+            )
+        else:
+            session.install("-r", "requirements-test.txt")
+        with open("requirements-test.txt") as rtfile:
+            packages += rtfile.read()
+
+    if INSTALL_LIBRARY_FROM_SOURCE:
+        session.install("-e", _get_repo_root())
+
+    if post_install:
+        post_install(session)
+
+    if "pytest-parallel" in packages:
+        concurrent_args.extend(['--workers', 'auto', '--tests-per-worker', 'auto'])
+    elif "pytest-xdist" in packages:
+        concurrent_args.extend(['-n', 'auto'])
+
+    session.run(
+        "pytest",
+        *(PYTEST_COMMON_ARGS + session.posargs + concurrent_args),
+        # Pytest will return 5 when no tests are collected. This can happen
+        # on travis where slow and flaky tests are excluded.
+        # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html
+        success_codes=[0, 5],
+        env=get_pytest_env_vars(),
+    )
+
+
+@nox.session(python=ALL_VERSIONS)
+def py(session: nox.sessions.Session) -> None:
+    """Runs py.test for a sample using the specified version of Python."""
+    if session.python in TESTED_VERSIONS:
+        _session_tests(session)
+    else:
+        session.skip(
+            "SKIPPED: {} tests are disabled for this sample.".format(session.python)
+        )
+
+
+#
+# Readmegen
+#
+
+
+def _get_repo_root() -> Optional[str]:
+    """Returns the root folder of the project."""
+    # Get root of this repository.
+    # Assume we don't have directories nested deeper than 10 items.
+    p = Path(os.getcwd())
+    for i in range(10):
+        if p is None:
+            break
+        if Path(p / ".git").exists():
+            return str(p)
+        p = p.parent
+    raise Exception("Unable to detect repository root.")
+
+
+GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")])
+
+
+@nox.session
+@nox.parametrize("path", GENERATED_READMES)
+def readmegen(session: nox.sessions.Session, path: str) -> None:
+    """(Re-)generates the readme for a sample."""
+    session.install("jinja2", "pyyaml")
+    dir_ = os.path.dirname(path)
+
+    if os.path.exists(os.path.join(dir_, "requirements.txt")):
+        session.install("-r", os.path.join(dir_, "requirements.txt"))
+
+    in_file = os.path.join(dir_, "README.rst.in")
+    session.run(
+        "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file
+    )
\ No newline at end of file
diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py
new file mode 100644
index 00000000..e7c25fdb
--- /dev/null
+++ b/samples/snippets/quickstart_sample.py
@@ -0,0 +1,45 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# [START documentai_toolbox_quickstart]
+
+from google.cloud.documentai_toolbox import document
+
+# TODO(developer): Uncomment these variables before running the sample.
+# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
+# gcs_bucket_name = "bucket"
+# gcs_prefix = "path/to/folder"
+
+
+def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
+    wrapped_document = document.Document.from_gcs(
+        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
+    )
+
+    print("Document Successfully Loaded!")
+    print(f"\t Number of Pages: {len(wrapped_document.pages)}")
+    print(f"\t Number of Entities: {len(wrapped_document.entities)}")
+
+    for idx, page in enumerate(wrapped_document.pages):
+        print(f"Page {idx}")
+        for paragraph in page.paragraphs:
+            print(paragraph.text)
+
+    for entity in wrapped_document.entities:
+        print(f"{entity.type_} : {entity.mention_text}")
+
+
+# [END documentai_toolbox_quickstart]
diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt
new file mode 100644
index 00000000..8d88a60a
--- /dev/null
+++ b/samples/snippets/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest==7.2.1
+mock==5.0.1
\ No newline at end of file
diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt
new file mode 100644
index 00000000..78dd8e3f
--- /dev/null
+++ b/samples/snippets/requirements.txt
@@ -0,0 +1,3 @@
+google-cloud-documentai==1.2.1
+google-cloud-storage==2.7.0
+google-cloud-documentai-toolbox==0.1.0a0
diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py
new file mode 100644
index 00000000..fbd69b34
--- /dev/null
+++ b/samples/snippets/test_quickstart_sample.py
@@ -0,0 +1,34 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+import pytest
+from samples.snippets import quickstart_sample
+
+location = "us"
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+gcs_bucket_name = "documentai_toolbox_samples"
+gcs_input_uri = "output/123456789/0"
+
+
+def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
+    quickstart_sample.quickstart_sample(
+        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_input_uri
+    )
+    out, _ = capsys.readouterr()
+
+    assert "Number of Pages: 1" in out
+    assert "Number of Entities: 22" in out
diff --git a/setup.py b/setup.py
index e42ea5a5..f731a68e 100644
--- a/setup.py
+++ b/setup.py
@@ -38,17 +38,18 @@
    version=version,
    license="Apache 2.0",
    long_description=readme,
+    long_description_content_type="text/x-rst",
    packages=setuptools.PEP420PackageFinder.find(),
    namespace_packages=("google", "google.cloud"),
    platforms="Posix; MacOS X; Windows",
    include_package_data=True,
    install_requires=(
        "google-api-core >= 1.31.5, <3.0.0dev,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0",
-        "pandas >= 1.0.0, <1.5.0",
+        "pandas >= 1.0.0, <2.0.0",
        "proto-plus >= 1.22.0, <2.0.0dev",
        "proto-plus >= 1.22.2, <2.0.0dev; python_version>='3.11'",
        "grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
-        "google-cloud-documentai >= 1.2.1, < 2.0.0dev",
+        "google-cloud-documentai >= 1.2.1, < 3.0.0dev",
        "google-cloud-storage >= 1.31.0, < 3.0.0dev",
        "numpy >= 1.18.1",
    ),
diff --git a/tests/unit/test_page.py b/tests/unit/test_page.py
index d0a5c2fa..6b493673 100644
--- a/tests/unit/test_page.py
+++ b/tests/unit/test_page.py
@@ -22,7 +22,9 @@
@pytest.fixture
def docproto():
-    with open("tests/unit/resources/0/toolbox_invoice_test-0.json", "r") as f:
+    with open(
+        "tests/unit/resources/0/toolbox_invoice_test-0.json", "r", encoding="utf-8"
+    ) as f:
        return documentai.Document.from_json(f.read())
@@ -117,22 +119,22 @@ def test_table_wrapper_from_documentai_table(docproto):
    assert len(table.header_rows[0]) == 4


-def test_header_for_table_row_from_documentai_table_row(docproto):
+def test_header_for_table_rows_from_documentai_table_rows(docproto):
    docproto_page = docproto.pages[0]
-    header_row = page._table_row_from_documentai_table_row(
+    header_rows = page._table_rows_from_documentai_table_rows(
        table_rows=docproto_page.tables[0].header_rows, text=docproto.text
    )

-    assert header_row == [["Item Description", "Quantity", "Price", "Amount"]]
+    assert header_rows == [["Item Description", "Quantity", "Price", "Amount"]]


-def test_body_for_table_row_from_documentai_table_row(docproto):
+def test_body_for_table_rows_from_documentai_table_rows(docproto):
    docproto_page = docproto.pages[0]
-    body_row = page._table_row_from_documentai_table_row(
+    body_rows = page._table_rows_from_documentai_table_rows(
        table_rows=docproto_page.tables[0].body_rows, text=docproto.text
    )

-    assert body_row == [
+    assert body_rows == [
        ["Tool A", "500", "$1.00", "$500.00"],
        ["Service B", "1", "$900.00", "$900.00"],
        ["Resource C", "50", "$12.00", "$600.00"],