1
+ #!/usr/bin/env python
2
+
3
+ # Copyright 2019 Google LLC. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import argparse
18
+
19
+
20
+ # [START document_parse_key_value]
21
def parse_key_value_gcs(project_id, gcs_source_uri, gcs_destination_uri,
                        mime_type='application/pdf', timeout=60):
    """Parse key-value pairs with PDF/TIFF as source files on Google Cloud Storage.

    Submits a batch Document AI request with form extraction enabled, waits
    for the long-running operation to finish, lists the JSON shards written
    under the destination prefix, and prints the key-value pairs found on
    the first page of the first shard.

    Args:
        project_id: Google Cloud project ID; used to build the
            ``projects/<project_id>`` parent of the request.
        gcs_source_uri: ``gs://`` URI of the document to process.
        gcs_destination_uri: ``gs://<bucket>/<prefix>`` URI under which the
            JSON output is written. A non-empty prefix is required.
        mime_type: MIME type declared for the source document. Defaults to
            ``'application/pdf'``; pass e.g. ``'image/tiff'`` for TIFF input
            (see the Document AI docs for the accepted values).
        timeout: Seconds to wait for the batch operation to complete.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``.
        IndexError: If no output files are found under the destination
            prefix once the operation has finished.
    """
    import re
    from google.cloud import documentai
    from google.cloud.documentai import types
    from google.cloud import storage
    from google.protobuf import json_format

    # Validate the destination up front so that a malformed URI fails with a
    # clear message before the batch operation is started.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = types.GcsSource(uri=gcs_source_uri)
    input_config = types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    # How many pages should be grouped into each json output file.
    pages_per_shard = 1
    gcs_destination = types.GcsDestination(uri=gcs_destination_uri)
    output_config = types.OutputConfig(
        gcs_destination=gcs_destination, pages_per_shard=pages_per_shard)

    # Provide key-value pair hints.
    # For each key hint, the key is some text that is likely to appear in the
    # document as a key. Value types are optional, but can be one or more of
    # DATE, LOCATION, ORGANIZATION, etc.
    # Accepted value types: ADDRESS, LOCATION, ORGANIZATION, PERSON,
    # PHONE_NUMBER, ID, NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        types.KeyValuePairHint(key='Phone', value_types=['PHONE_NUMBER']),
        types.KeyValuePairHint(key='Contact', value_types=['EMAIL', 'NAME']),
    ]

    form_extraction_params = types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    request = types.ProcessDocumentRequest(
        input_config=input_config, output_config=output_config,
        form_extraction_params=form_extraction_params)

    requests = [request]

    print('Waiting for operation to finish.')
    parent = 'projects/{}'.format(project_id)
    operation = client.batch_process_documents(requests, parent=parent)

    # Block until the operation completes. The returned value is not needed
    # because the results are read back from Cloud Storage below.
    operation.result(timeout=timeout)

    # After the output json files have been written to GCS we can process them.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise IndexError(
            'No output files found under {!r}'.format(gcs_destination_uri))

    # Process the first output. We specified pages_per_shard=1, so this
    # corresponds to the data extracted from the first page of the document.
    first_output = blob_list[0]
    json_string = first_output.download_as_string()

    response = json_format.Parse(
        json_string, types.Document(), ignore_unknown_fields=True)

    def get_text(text_anchor):
        """Return the stripped document text covered by ``text_anchor``."""
        # Each segment holds start/end offsets into the full document text.
        return ''.join(
            response.text[segment.start_index:segment.end_index]
            for segment in text_anchor.text_segments
        ).strip()

    first_page = response.pages[0]

    for field in first_page.form_fields:
        field_name_text = get_text(field.field_name.text_anchor)
        field_value_text = get_text(field.field_value.text_anchor)

        print('Extracted key-value pair: ({}, {})'.format(
            field_name_text, field_value_text))
97
+ # [END document_parse_key_value]
98
+
99
+
100
if __name__ == '__main__':
    # Command-line entry point: takes three positional arguments and passes
    # them, in order, to parse_key_value_gcs.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    for positional in ('project_id', 'gcs_source_uri', 'gcs_destination_uri'):
        cli.add_argument(positional)
    options = cli.parse_args()

    parse_key_value_gcs(
        options.project_id,
        options.gcs_source_uri,
        options.gcs_destination_uri)
0 commit comments