Skip to content

Commit 7405c00

Browse files
dizcologychenyumic
authored and
chenyumic
committed
[DO NOT MERGE] Vision API OCR PDF/TIFF sample (GoogleCloudPlatform#1420)
* add docpdf sample * import order * list blobs * filename change * add the renamed files * parse json string to AnnotateFileResponse message * show more of the response * simplify response processing to better focus on how to make the request * fix typo * linter * linter * linter
1 parent f427368 commit 7405c00

File tree

3 files changed

+149
-0
lines changed

3 files changed

+149
-0
lines changed
+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2018 Google Inc. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
"""OCR with PDF/TIFF as source files on GCS
19+
20+
Example:
21+
python detect_pdf.py \
22+
--gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
23+
--gcs-destination-uri gs://BUCKET_NAME/PREFIX/
24+
"""
25+
26+
import argparse
27+
import re
28+
29+
from google.cloud import storage
30+
from google.cloud import vision_v1p2beta1 as vision
31+
from google.protobuf import json_format
32+
33+
34+
# [START vision_async_detect_document_ocr]
35+
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          mime_type='application/pdf', batch_size=2,
                          timeout=90):
    """Run asynchronous document OCR on a file stored in GCS.

    Submits the source file to the Vision API, waits for the operation to
    finish, lists the JSON output files written under the destination
    prefix, and prints the full text detected on the first page.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file.
        gcs_destination_uri: ``gs://BUCKET_NAME/PREFIX/`` URI under which
            the JSON output files are written.
        mime_type: MIME type of the input file. Supported values are
            'application/pdf' and 'image/tiff'.
        batch_size: How many pages are grouped into each JSON output file.
            For example, a 5-page file with batch_size=2 is written as
            3 output files (2 + 2 + 1 pages).
        timeout: Seconds to wait for the operation to complete.

    Raises:
        ValueError: If ``gcs_destination_uri`` does not look like
            ``gs://BUCKET_NAME/PREFIX``.
        RuntimeError: If no output file is found under the destination
            prefix once the operation has finished.
    """
    # Validate the destination up front so that a malformed URI fails
    # before the long-running OCR operation is started.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX/, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # Passed positionally so the call does not depend on the keyword name
    # of the parameter in the installed google-cloud-storage version.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, skipping folder placeholder
    # objects (names ending in '/'), which are not JSON output files.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {!r}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # It contains the responses for the first `batch_size` pages
    # of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
101+
# [END vision_async_detect_document_ocr]
102+
103+
104+
if __name__ == '__main__':
    # Command-line entry point: parse the two required GCS URIs and run
    # the OCR sample on them. The module docstring (with its usage
    # example) is shown verbatim as the --help description.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '--gcs-source-uri', required=True,
        help='gs:// URI of the source file to run OCR on.')
    parser.add_argument(
        '--gcs-destination-uri', required=True,
        help='gs://BUCKET_NAME/PREFIX/ location for the JSON output files.')

    args = parser.parse_args()
    async_detect_document(args.gcs_source_uri, args.gcs_destination_uri)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2018 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
17+
from google.cloud import storage
18+
19+
from detect_pdf import async_detect_document
20+
21+
BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
22+
OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
23+
GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
24+
GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
25+
26+
27+
def test_async_detect_document(capsys):
    """Run the OCR sample end to end and check the printed text.

    Calls async_detect_document on the test PDF in the configured bucket
    and asserts that the captured stdout contains 'Hodge conjecture'.
    The output objects written under OUTPUT_PREFIX are deleted afterwards.
    """
    try:
        async_detect_document(
            gcs_source_uri=GCS_SOURCE_URI,
            gcs_destination_uri=GCS_DESTINATION_URI)
        out, _ = capsys.readouterr()

        assert 'Hodge conjecture' in out
    finally:
        # Clean up even when the sample raises or the assertion fails, so
        # that leftover output objects do not accumulate in the bucket or
        # get picked up by a later run.
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(BUCKET)
        for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
            blob.delete()
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
google-cloud-vision==0.30.1
2+
google-cloud-storage==1.6.0

0 commit comments

Comments
 (0)