|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2018 Google Inc. All Rights Reserved. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | + |
| 18 | +"""OCR with PDF/TIFF as source files on GCS |
| 19 | +
|
| 20 | +Example: |
| 21 | + python detect_pdf.py \ |
| 22 | + --gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \ |
| 23 | + --gcs-destination-uri gs://BUCKET_NAME/PREFIX/ |
| 24 | +""" |
| 25 | + |
| 26 | +import argparse |
| 27 | +import re |
| 28 | + |
| 29 | +from google.cloud import storage |
| 30 | +from google.cloud import vision_v1p2beta1 as vision |
| 31 | +from google.protobuf import json_format |
| 32 | + |
| 33 | + |
| 34 | +# [START vision_async_detect_document_ocr] |
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Run asynchronous document OCR on a GCS file and print the text.

    Submits one asynchronous DOCUMENT_TEXT_DETECTION request for the file
    at ``gcs_source_uri``, waits up to 90 seconds for it to finish, lists
    the JSON output objects written under ``gcs_destination_uri`` and
    prints the full text of the first annotated page.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file. The request is
            sent with mime type 'application/pdf'.
        gcs_destination_uri: Output location of the form
            ``gs://BUCKET_NAME/PREFIX``.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://BUCKET_NAME/PREFIX``.
        RuntimeError: If no output file is found under the destination
            prefix once the operation has completed.
    """
    # Validate the destination first so that a malformed URI fails fast,
    # before the long-running OCR operation is submitted.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    # For example, with a 5-page input and batch_size = 2, the pages are
    # written in groups of 2, 2 and 1.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=90)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # The bucket name is passed positionally: the keyword name of this
    # parameter differs between google-cloud-storage releases.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, skipping "folder" placeholder
    # objects (names ending in '/'), which hold no OCR output.
    blob_list = [
        blob for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith('/')
    ]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
| 101 | +# [END vision_async_detect_document_ocr] |
| 102 | + |
| 103 | + |
if __name__ == '__main__':
    # Command-line entry point: both GCS URIs are mandatory flags.
    cli = argparse.ArgumentParser()
    for flag in ('--gcs-source-uri', '--gcs-destination-uri'):
        cli.add_argument(flag, required=True)

    parsed = cli.parse_args()
    async_detect_document(
        parsed.gcs_source_uri,
        parsed.gcs_destination_uri)
0 commit comments