Skip to content

fix(documentai): refactor 'documentai_quickstart' sample for latest style guide #13256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 43 additions & 35 deletions documentai/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,72 +11,80 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# flake8: noqa

# [START documentai_quickstart]

from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore

# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_PROCESSOR_LOCATION" # Format is "us" or "eu"
# file_path = "/path/to/local/pdf"
# processor_display_name = "YOUR_PROCESSOR_DISPLAY_NAME" # Must be unique per project, e.g.: "My Processor"
from google.cloud.documentai_v1.types.document import Document
from google.cloud.documentai_v1.types.processor import Processor


def quickstart(
project_id: str,
location: str,
file_path: str,
processor_display_name: str = "My Processor",
):
# You must set the `api_endpoint` if you use a location other than "us".
processor_display_name: str,
) -> tuple[Processor, Document]:
# [START documentai_quickstart]
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 # type: ignore

# TODO(developer): Update and uncomment these variables before running the sample.
# project_id = "MY_PROJECT_ID"

# Processor location. For example: "us" or "eu".
# location = "MY_PROCESSOR_LOCATION"

# Path for file to process.
# file_path = "/path/to/local/pdf"

# Processor display name must be unique per project.
# processor_display_name = "MY_PROCESSOR_DISPLAY_NAME"

# Set `api_endpoint` if you use a location other than "us".
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

client = documentai.DocumentProcessorServiceClient(client_options=opts)
# Initialize Document AI client.
client = documentai_v1.DocumentProcessorServiceClient(client_options=opts)

# The full resource name of the location, e.g.:
# `projects/{project_id}/locations/{location}`
# Get the full resource name of the location.
# For example: `projects/{project_id}/locations/{location}`
parent = client.common_location_path(project_id, location)

# Create a Processor
# Create a Processor.
# For available types, refer to https://cloud.google.com/document-ai/docs/create-processor
processor = client.create_processor(
parent=parent,
processor=documentai.Processor(
type_="OCR_PROCESSOR", # Refer to https://cloud.google.com/document-ai/docs/create-processor for how to get available processor types
processor=documentai_v1.Processor(
type_="OCR_PROCESSOR",
display_name=processor_display_name,
),
)

# Print the processor information
# Print the processor information.
print(f"Processor Name: {processor.name}")

# Read the file into memory
# Read the file into memory.
with open(file_path, "rb") as image:
image_content = image.read()

# Load binary data
raw_document = documentai.RawDocument(
# Load binary data.
# For supported MIME types, refer to https://cloud.google.com/document-ai/docs/file-types
raw_document = documentai_v1.RawDocument(
content=image_content,
mime_type="application/pdf", # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
mime_type="application/pdf",
)

# Configure the process request
# `processor.name` is the full resource name of the processor, e.g.:
# `projects/{project_id}/locations/{location}/processors/{processor_id}`
request = documentai.ProcessRequest(name=processor.name, raw_document=raw_document)
# Configure the process request.
# `processor.name` is the full resource name of the processor.
# For example: `projects/{project_id}/locations/{location}/processors/{processor_id}`
request = documentai_v1.ProcessRequest(name=processor.name, raw_document=raw_document)

result = client.process_document(request=request)
document = result.document

# Read the text recognition output from the processor.
# For a full list of `Document` object attributes, reference this page:
# https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
document = result.document

# Read the text recognition output from the processor
print("The document contains the following text:")
print(document.text)
# [END documentai_quickstart]
return processor

return processor, document
40 changes: 18 additions & 22 deletions documentai/snippets/quickstart_sample_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# # Copyright 2020 Google LLC
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,43 +11,39 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# flake8: noqa

import os
from uuid import uuid4

from documentai.snippets import quickstart_sample

from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore
from google.cloud import documentai_v1

location = "us"
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
processor_display_name = f"test-processor-{uuid4()}"
file_path = "resources/invoice.pdf"
LOCATION = "us"
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
PROCESSOR_DISPLAY_NAME = f"test-processor-{uuid4()}"
FILE_PATH = "resources/invoice.pdf"


def test_quickstart(capsys):
processor = quickstart_sample.quickstart(
project_id=project_id,
location=location,
processor_display_name=processor_display_name,
file_path=file_path,
def test_quickstart() -> None:
processor, document = quickstart_sample.quickstart(
project_id=PROJECT_ID,
location=LOCATION,
processor_display_name=PROCESSOR_DISPLAY_NAME,
file_path=FILE_PATH,
)
out, _ = capsys.readouterr()

assert processor is not None
assert "Invoice" in document.text

# Delete created processor
client = documentai.DocumentProcessorServiceClient(
client = documentai_v1.DocumentProcessorServiceClient(
client_options=ClientOptions(
api_endpoint=f"{location}-documentai.googleapis.com"
api_endpoint=f"{LOCATION}-documentai.googleapis.com"
)
)
operation = client.delete_processor(name=processor.name)

# Wait for operation to complete
operation.result()

assert "Processor Name:" in out
assert "text:" in out
assert "Invoice" in out