Skip to content

Commit bc7d946

Browse files
committed
add parse_key_value and test
1 parent 987fc4f commit bc7d946

File tree

11 files changed

+167
-7
lines changed

11 files changed

+167
-7
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2019 Google LLC. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
import argparse
18+
19+
20+
# [START document_parse_key_value]
def parse_key_value_gcs(project_id, gcs_source_uri, gcs_destination_uri):
    """Parse key-value pairs with PDF/TIFF as source files on Google Cloud Storage.

    Runs a batch Document AI form-extraction request, waits for the output
    JSON shards to land in GCS, then prints every extracted key-value pair
    from the first page.

    Args:
        project_id: Google Cloud project that issues the Document AI request.
        gcs_source_uri: ``gs://bucket/path`` URI of the source document.
        gcs_destination_uri: ``gs://bucket/prefix/`` URI under which the JSON
            output shards are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not a ``gs://bucket/prefix``
            style URI.
    """
    import re

    from google.cloud import documentai
    from google.cloud.documentai import types
    from google.cloud import storage
    from google.protobuf import json_format

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = types.GcsSource(uri=gcs_source_uri)
    # NOTE(review): mime_type is hard-coded to PDF even though the docstring
    # also mentions TIFF sources; use 'image/tiff' for TIFF inputs.
    input_config = types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # How many pages should be grouped into each json output file.
    pages_per_shard = 1
    gcs_destination = types.GcsDestination(uri=gcs_destination_uri)
    output_config = types.OutputConfig(
        gcs_destination=gcs_destination, pages_per_shard=pages_per_shard)

    # Provide key-value pair hints.
    # For each key hint, key is some text that is likely to appear in the
    # document as key, value types are optional, but can be one or more of
    # DATE, LOCATION, ORGANIZATION, etc.
    # Accepted value types: ADDRESS, LOCATION, ORGANIZATION, PERSON,
    # PHONE_NUMBER, ID, NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        types.KeyValuePairHint(key='Phone', value_types=['PHONE_NUMBER']),
        types.KeyValuePairHint(key='Contact', value_types=['EMAIL', 'NAME'])
    ]

    form_extraction_params = types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    request = types.ProcessDocumentRequest(
        input_config=input_config, output_config=output_config,
        form_extraction_params=form_extraction_params)

    requests = [request]

    print('Waiting for operation to finish.')
    parent = 'projects/{}'.format(project_id)
    operation = client.batch_process_documents(requests, parent=parent)

    # Block until the batch operation completes; the per-page results are
    # read back from GCS below rather than from the operation response.
    operation.result(timeout=60)

    # After the output json files have been written to GCS we can process them.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        # Fail fast with a clear message instead of an AttributeError on
        # match.group() below.
        raise ValueError(
            'gcs_destination_uri must look like gs://bucket/prefix, '
            'got {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output shard. We specified pages_per_shard=1, so this
    # corresponds to the data extracted from the first page of the document.
    first_output = blob_list[0]
    json_string = first_output.download_as_string()

    response = json_format.Parse(
        json_string, types.Document(), ignore_unknown_fields=True)

    def get_text(text_anchor):
        # Join the text segments referenced by the anchor into one string.
        text = ''
        for segment in text_anchor.text_segments:
            text += response.text[segment.start_index:segment.end_index]

        return text.strip()

    first_page = response.pages[0]

    for field in first_page.form_fields:
        field_name_text = get_text(field.field_name.text_anchor)
        field_value_text = get_text(field.field_value.text_anchor)

        print('Extracted key-value pair: ({}, {})'.format(
            field_name_text, field_value_text))
# [END document_parse_key_value]
98+
99+
100+
if __name__ == '__main__':
    # Command-line entry point: collect the three positional arguments and
    # run the sample end-to-end.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'project_id', help='Google Cloud project ID.')
    parser.add_argument(
        'gcs_source_uri',
        help='gs:// URI of the source PDF/TIFF document.')
    parser.add_argument(
        'gcs_destination_uri',
        help='gs:// URI prefix where the JSON output shards are written.')
    args = parser.parse_args()

    parse_key_value_gcs(
        args.project_id, args.gcs_source_uri, args.gcs_destination_uri)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright 2019 Google LLC All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
import uuid

import pytest

from google.cloud import storage
from parse_key_value import parse_key_value_gcs


# Project used for the Document AI request; os.getenv returns None if the
# GCLOUD_PROJECT environment variable is unset.
PROJECT_ID = os.getenv('GCLOUD_PROJECT')
# Shared, publicly readable sample invoice used as the parse source.
GCS_SOURCE_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
# Output bucket; raises KeyError at import time if CLOUD_STORAGE_BUCKET
# is unset, so the suite fails loudly rather than mid-test.
BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
26+
27+
28+
@pytest.fixture
def gcs_destination_uri():
    """Yield a unique gs:// output prefix, deleting its blobs afterwards."""
    # str() so the same value is usable both in the formatted URI and as the
    # string prefix expected by bucket.list_blobs() during cleanup (the raw
    # uuid.UUID object is not a valid prefix argument).
    prefix = str(uuid.uuid4())
    uri = 'gs://{}/{}/'.format(BUCKET, prefix)

    yield uri

    # Teardown: remove everything the test wrote under the unique prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(BUCKET)
    for blob in bucket.list_blobs(prefix=prefix):
        blob.delete()
39+
40+
41+
def test_parse_key_value_gcs(capsys, gcs_destination_uri):
    # Run the sample end-to-end against the shared test invoice, then check
    # that a known date value from that invoice appears in the printed
    # key-value pairs.
    parse_key_value_gcs(PROJECT_ID, GCS_SOURCE_URI, gcs_destination_uri)

    captured, _ = capsys.readouterr()
    assert '01/01/1970' in captured

document/cloud-client/parse_table.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,28 +82,34 @@ def parse_table_gcs(project_id, gcs_source_uri, gcs_destination_uri):
8282

8383
response = json_format.Parse(json, types.Document(), ignore_unknown_fields=True)
8484

85-
def get_cell_text(cell):
85+
# def get_cell_text(cell):
86+
# text = ''
87+
# for segment in cell.layout.text_anchor.text_segments:
88+
# text += response.text[segment.start_index:segment.end_index]
89+
90+
# return text
91+
92+
# helper function to get the extracted text from text_anchor.
93+
def get_text(text_anchor):
8694
text = ''
87-
for segment in cell.layout.text_anchor.text_segments:
95+
for segment in text_anchor.text_segments:
8896
text += response.text[segment.start_index:segment.end_index]
8997

90-
return text
91-
92-
# import ipdb; ipdb.set_trace()
98+
return text.strip()
9399

94100
first_page = response.pages[0]
95101
first_table = first_page.tables[0]
96102

97103
first_header_row = first_table.header_rows[0]
98104
for cell in first_header_row.cells:
99105
# Get the text
100-
text = get_cell_text(cell)
106+
text = get_text(cell.layout.text_anchor)
101107
print('Header row: {}'.format(text))
102108

103109
for body_row in first_table.body_rows:
104110
print('Body row:')
105111
for cell in body_row.cells:
106-
text = get_cell_text(cell)
112+
text = get_text(cell.layout.text_anchor)
107113
print('Extracted cell: {}'.format(text))
108114
# [END document_parse_table]
109115

-257 KB
Binary file not shown.
-335 KB
Binary file not shown.
-60.8 KB
Binary file not shown.
-228 KB
Binary file not shown.
-33.2 MB
Binary file not shown.
35.5 KB
Binary file not shown.
-219 KB
Binary file not shown.

0 commit comments

Comments
 (0)