From aa4be7910bf09f16cc3dd1a2ca39d895d2750f07 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:10:30 -0700 Subject: [PATCH 1/4] Fully update inspect_content and redact DLP samples --- dlp/inspect_content.py | 385 +++++++++++++++++++++++++++++++++++- dlp/inspect_content_test.py | 103 ++++++++++ dlp/redact.py | 111 +---------- dlp/redact_test.py | 29 --- dlp/requirements.txt | 4 +- 5 files changed, 494 insertions(+), 138 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index f99e40db57c..4fb45bb34b6 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -21,7 +21,7 @@ import os -# [START inspect_string] +# [START dlp_inspect_string] def inspect_string(project, content_string, info_types, min_likelihood=None, max_findings=None, include_quote=True): """Uses the Data Loss Prevention API to analyze strings for protected data. @@ -80,10 +80,10 @@ def inspect_string(project, content_string, info_types, print('Likelihood: {}'.format(finding.likelihood)) else: print('No findings.') -# [END inspect_string] +# [END dlp_inspect_string] -# [START inspect_file] +# [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, max_findings=None, include_quote=True, mime_type=None): """Uses the Data Loss Prevention API to analyze a file for protected data. @@ -163,10 +163,10 @@ def inspect_file(project, filename, info_types, min_likelihood=None, print('Likelihood: {}'.format(finding.likelihood)) else: print('No findings.') -# [END inspect_file] +# [END dlp_inspect_file] -# [START inspect_gcs_file] +# [START dlp_inspect_gcs] def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, info_types, min_likelihood=None, max_findings=None, timeout=300): @@ -192,6 +192,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, None; the response from the API is printed to the terminal. """ + # Import the client library. import google.cloud.dlp @@ -219,7 +220,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, 'limits': {'max_findings_per_request': max_findings}, } - # Construct a cloud_storage_options dictionary with the file's URL. + # Construct a storage_config containing the file's URL. url = 'gs://{}/{}'.format(bucket, filename) storage_config = { 'cloud_storage_options': { @@ -288,7 +289,266 @@ def callback(message): print('No event received before the timeout. Please verify that the ' 'subscription provided is subscribed to the topic provided.') -# [END inspect_gcs_file] +# [END dlp_inspect_gcs] + + +# [START dlp_inspect_datastore] +def inspect_datastore(project, datastore_project, kind, + topic_id, subscription_id, info_types, namespace_id=None, + min_likelihood=None, max_findings=None, timeout=300): + """Uses the Data Loss Prevention API to analyze Datastore data. + Args: + project: The Google Cloud project id to use as a parent resource. + datastore_project: The Google Cloud project id of the target Datastore. + kind: The kind of the Datastore entity to inspect, e.g. 'Person'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. 
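+            If empty or None, the sample defaults to 'FIRST_NAME',
+            'LAST_NAME' and 'EMAIL_ADDRESS'.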
+ namespace_id: The namespace of the Datastore document, if applicable. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] + info_types = [{'name': info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + 'info_types': info_types, + 'min_likelihood': min_likelihood, + 'limits': {'max_findings_per_request': max_findings}, + } + + # Construct a storage_config containing the target Datastore info. + storage_config = { + 'datastore_options': { + 'partition_id': { + 'project_id': datastore_project, + 'namespace_id': namespace_id, + }, + 'kind': { + 'name': kind + }, + } + } + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Tell the API where to send a notification when the job is complete. + actions = [{ + 'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)} + }] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + 'inspect_config': inspect_config, + 'storage_config': storage_config, + 'actions': actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, subscription_id) + subscription = subscriber.subscribe(subscription_path) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if (message.attributes['DlpJobName'] == operation.name): + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print('Info type: {}; Count: {}'.format( + finding.info_type.name, finding.count)) + else: + print('No findings.') + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. 
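+    # (With the pinned google-cloud-pubsub==0.32.1, subscribe() returns a
+    # subscription object and open() starts the background message consumer;
+    # newer releases of the library changed this interface.)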
+    subscription.open(callback)
+    finished = job_done.wait(timeout=timeout)
+    if not finished:
+        print('No event received before the timeout. Please verify that the '
+              'subscription provided is subscribed to the topic provided.')
+
+# [END dlp_inspect_datastore]
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
+                     topic_id, subscription_id, info_types,
+                     min_likelihood=None, max_findings=None, timeout=300):
+    """Uses the Data Loss Prevention API to analyze BigQuery data.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        bigquery_project: The Google Cloud project id of the target table.
+        dataset_id: The id of the target BigQuery dataset.
+        table_id: The id of the target BigQuery table.
+        topic_id: The id of the Cloud Pub/Sub topic to which the API will
+            broadcast job completion. The topic must already exist.
+        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+            while waiting for job completion. The subscription must already
+            exist and be subscribed to the topic.
+        info_types: A list of strings representing info types to look for.
+            A full list of info type categories can be fetched from the API.
+        min_likelihood: A string representing the minimum likelihood threshold
+            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+        max_findings: The maximum number of findings to report; 0 = no maximum.
+        timeout: The number of seconds to wait for a response from the API.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # This sample also uses threading.Event() to wait for the job to finish.
+    import threading
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp.DlpServiceClient()
+
+    # Prepare info_types by converting the list of strings into a list of
+    # dictionaries (protos are also accepted).
+    if not info_types:
+        info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
+    info_types = [{'name': info_type} for info_type in info_types]
+
+    # Construct the configuration dictionary. Keys which are None may
+    # optionally be omitted entirely.
+    inspect_config = {
+        'info_types': info_types,
+        'min_likelihood': min_likelihood,
+        'limits': {'max_findings_per_request': max_findings},
+    }
+
+    # Construct a storage_config containing the target BigQuery info.
+    storage_config = {
+        'big_query_options': {
+            'table_reference': {
+                'project_id': bigquery_project,
+                'dataset_id': dataset_id,
+                'table_id': table_id,
+            }
+        }
+    }
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{
+        'pub_sub': {'topic': '{}/topics/{}'.format(parent, topic_id)}
+    }]
+
+    # Construct the inspect_job, which defines the entire inspect content task.
+    inspect_job = {
+        'inspect_config': inspect_config,
+        'storage_config': storage_config,
+        'actions': actions,
+    }
+
+    operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
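+    # (If needed, both can be created ahead of time, for example with
+    # `gcloud pubsub topics create <topic>` and
+    # `gcloud pubsub subscriptions create <subscription> --topic <topic>`.)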
+ subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path( + project, subscription_id) + subscription = subscriber.subscribe(subscription_path) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if (message.attributes['DlpJobName'] == operation.name): + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print('Info type: {}; Count: {}'.format( + finding.info_type.name, finding.count)) + else: + print('No findings.') + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscription.open(callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print('No event received before the timeout. Please verify that the ' + 'subscription provided is subscribed to the topic provided.') + +# [END dlp_inspect_bigquery] if __name__ == '__main__': @@ -404,6 +664,100 @@ def callback(message): 'API. The default is 300 seconds.', default=300) + parser_datastore = subparsers.add_parser( + 'datastore', help='Inspect files on Google Datastore.') + parser_datastore.add_argument( + 'datastore_project', + help='The Google Cloud project id of the target Datastore.') + parser_datastore.add_argument( + 'kind', + help='The kind of the Datastore entity to inspect, e.g. "Person".') + parser_datastore.add_argument( + 'topic_id', + help='The id of the Cloud Pub/Sub topic to use to report that the job ' + 'is complete, e.g. "dlp-sample-topic".') + parser_datastore.add_argument( + 'subscription_id', + help='The id of the Cloud Pub/Sub subscription to monitor for job ' + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + 'already be subscribed to the topic. See the test files or the Cloud ' + 'Pub/Sub sample files for examples on how to create the subscription.') + parser_datastore.add_argument( + '--project', + help='The Google Cloud project id to use as a parent resource.', + default=default_project) + parser_datastore.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_datastore.add_argument( + '--namespace_id', + help='The Datastore namespace to use, if applicable.') + parser_datastore.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_datastore.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_datastore.add_argument( + '--timeout', type=int, + help='The maximum number of seconds to wait for a response from the ' + 'API. The default is 300 seconds.', + default=300) + + parser_bigquery = subparsers.add_parser( + 'bigquery', help='Inspect files on Google BigQuery.') + parser_bigquery.add_argument( + 'bigquery_project', + help='The Google Cloud project id of the target table.') + parser_bigquery.add_argument( + 'dataset_id', + help='The ID of the target BigQuery dataset.') + parser_bigquery.add_argument( + 'table_id', + help='The ID of the target BigQuery table.') + parser_bigquery.add_argument( + 'topic_id', + help='The id of the Cloud Pub/Sub topic to use to report that the job ' + 'is complete, e.g. "dlp-sample-topic".') + parser_bigquery.add_argument( + 'subscription_id', + help='The id of the Cloud Pub/Sub subscription to monitor for job ' + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + 'already be subscribed to the topic. See the test files or the Cloud ' + 'Pub/Sub sample files for examples on how to create the subscription.') + parser_bigquery.add_argument( + '--project', + help='The Google Cloud project id to use as a parent resource.', + default=default_project) + parser_bigquery.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_bigquery.add_argument( + '--min_likelihood', + choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', + 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], + help='A string representing the minimum likelihood threshold that ' + 'constitutes a match.') + parser_bigquery.add_argument( + '--max_findings', type=int, + help='The maximum number of findings to report; 0 = no maximum.') + parser_bigquery.add_argument( + '--timeout', type=int, + help='The maximum number of seconds to wait for a response from the ' + 'API. 
The default is 300 seconds.', + default=300) + args = parser.parse_args() if args.content == 'string': @@ -427,3 +781,20 @@ def callback(message): min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) + elif args.content == 'datastore': + inspect_datastore( + args.project, args.datastore_project, args.kind, + args.topic_id, args.subscription_id, + args.info_types, + namespace_id=args.namespace_id, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout) + elif args.content == 'bigquery': + inspect_bigquery( + args.project, args.bigquery_project, args.dataset_id, + args.table_id, args.topic_id, args.subscription_id, + args.info_types, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 62d0770c9f2..96f09a2c11d 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -15,6 +15,8 @@ import os import google.api_core.exceptions +import google.cloud.bigquery +import google.cloud.datastore import google.cloud.exceptions import google.cloud.pubsub import google.cloud.storage @@ -30,6 +32,9 @@ RESOURCE_FILE_NAMES = ['test.txt', 'test.png', 'harmless.txt', 'accounts.txt'] TOPIC_ID = 'dlp-test' SUBSCRIPTION_ID = 'dlp-test-subscription' +DATASTORE_KIND = 'DLP test kind' +BIGQUERY_DATASET_ID = 'dlp_test_dataset' +BIGQUERY_TABLE_ID = 'dlp_test_table' @pytest.fixture(scope='module') @@ -94,6 +99,61 @@ def subscription_id(topic_id): subscriber.delete_subscription(subscription_path) +@pytest.fixture(scope='module') +def datastore_project(): + # Adds test Datastore data, yields the project ID and then tears down. + datastore_client = google.cloud.datastore.Client() + + kind = DATASTORE_KIND + name = 'DLP test object' + key = datastore_client.key(kind, name) + item = google.cloud.datastore.Entity(key=key) + item['payload'] = 'My name is Gary Smith and my email is gary@example.com' + + datastore_client.put(item) + + yield GCLOUD_PROJECT + + datastore_client.delete(key) + + +@pytest.fixture(scope='module') +def bigquery_project(): + # Adds test Bigquery data, yields the project ID and then tears down. 
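+    # (scope='module' means the dataset and table are created once and shared
+    # by all tests in this file; the Conflict handlers below reuse anything
+    # left over from an earlier run.)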
+    bigquery_client = google.cloud.bigquery.Client()
+
+    dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+    dataset = google.cloud.bigquery.Dataset(dataset_ref)
+    try:
+        dataset = bigquery_client.create_dataset(dataset)
+    except google.api_core.exceptions.Conflict:
+        dataset = bigquery_client.get_dataset(dataset)
+
+    table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+    table = google.cloud.bigquery.Table(table_ref)
+
+    # A minimal two-column schema is enough for the findings these tests
+    # assert on.
+    table.schema = (
+        google.cloud.bigquery.SchemaField('Name', 'STRING'),
+        google.cloud.bigquery.SchemaField('Comment', 'STRING'),
+    )
+
+    try:
+        table = bigquery_client.create_table(table)
+    except google.api_core.exceptions.Conflict:
+        table = bigquery_client.get_table(table)
+
+    rows_to_insert = [
+        (u'Gary Smith', u'My email is gary@example.com',)
+    ]
+
+    bigquery_client.insert_rows(table, rows_to_insert)
+
+    yield GCLOUD_PROJECT
+
+    bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+
 def test_inspect_string(capsys):
     test_string = 'My name is Gary Smith and my email is gary@example.com'
 
@@ -212,3 +272,46 @@ def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys):
     out, _ = capsys.readouterr()
     assert 'Info type: EMAIL_ADDRESS' in out
     assert 'Info type: PHONE_NUMBER' in out
+
+
+def test_inspect_datastore(
+        datastore_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_datastore(
+        GCLOUD_PROJECT,
+        datastore_project,
+        DATASTORE_KIND,
+        topic_id,
+        subscription_id,
+        ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: EMAIL_ADDRESS' in out
+
+
+def test_inspect_datastore_no_results(
+        datastore_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_datastore(
+        GCLOUD_PROJECT,
+        datastore_project,
+        DATASTORE_KIND,
+        topic_id,
+        subscription_id,
+        ['PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'No findings' in out
+
+
+def test_inspect_bigquery(
+        bigquery_project, topic_id, subscription_id, capsys):
+    inspect_content.inspect_bigquery(
+        GCLOUD_PROJECT,
+        bigquery_project,
+        BIGQUERY_DATASET_ID,
+        BIGQUERY_TABLE_ID,
+        topic_id,
+        subscription_id,
+        ['FIRST_NAME', 'EMAIL_ADDRESS', 'PHONE_NUMBER'])
+
+    out, _ = capsys.readouterr()
+    assert 'Info type: FIRST_NAME' in out
diff --git a/dlp/redact.py b/dlp/redact.py
index 678999d2cb4..7d7a5379293 100644
--- a/dlp/redact.py
+++ b/dlp/redact.py
@@ -22,67 +22,6 @@
 import os
 
 
-# [START redact_string]
-def redact_string(item, replace_string, info_types=None, min_likelihood=None):
-    """Uses the Data Loss Prevention API to redact protected data in a string.
-    Args:
-        item: The string to inspect.
-        replace_string: The string to use to replace protected data; for
-            instance, '***' or 'REDACTED'. An empty string is permitted.
-        info_types: A list of strings representing info types to look for.
-            A full list of info type categories can be fetched from the API. If
-            info_types is omitted, the API will use a limited default set.
-        min_likelihood: A string representing the minimum likelihood threshold
-            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
-            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
-    Returns:
-        None; the response from the API is printed to the terminal.
-    """
-    # Import the client library
-    import google.cloud.dlp_v2beta1
-
-    # Instantiate a client.
- dlp = google.cloud.dlp_v2beta1.DlpServiceClient() - - # Prepare info_types by converting the list of strings into a list of - # dictionaries (protos are also accepted). - if info_types is not None: - info_types = [{'name': info_type} for info_type in info_types] - - # Prepare replace_configs, a list of dictionaries. Each dictionary contains - # an info_type and the string to which that info_type will be redacted upon - # detection. This sample uses the same "replace_string" for all info types, - # though the API supports using different ones for each type. - replace_configs = [] - - if info_types is not None: - for info_type in info_types: - replace_configs.append( - {'info_type': info_type, - 'replace_with': replace_string}) - else: - # If no info_type is specified, prepare a single dictionary with only a - # replace_string as a catch-all. - replace_configs.append({'replace_with': replace_string}) - - # Construct the configuration dictionary. Keys which are None may - # optionally be omitted entirely. - redact_config = { - 'info_types': info_types, - 'min_likelihood': min_likelihood, - } - - # Construct the items list (in this case, only one item, in string form). - items = [{'type': 'text/plain', 'value': item}] - - # Call the API. - response = dlp.redact_content(redact_config, items, replace_configs) - - # Print out the results. - print(response.items[0].value) -# [END redact_string] - - # [START redact_image] def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): @@ -168,67 +107,37 @@ def redact_image(project, filename, output_filename, default_project = os.environ.get('GCLOUD_PROJECT') parser = argparse.ArgumentParser(description=__doc__) - subparsers = parser.add_subparsers( - dest='content', help='Select how to submit content to the API.') - subparsers.required = True - - parser_string = subparsers.add_parser('string', help='Redact a string.') - parser_string.add_argument('item', help='The string to inspect.') - parser_string.add_argument( - 'replace_string', - help='The string to use to replace protected data; for instance, ' - '"***" or "REDACTED".') - parser_string.add_argument( - '--info_types', action='append', - help='Strings representing info types to look for. A full list of ' - 'info categories and types is available from the API. Examples ' - 'include "US_MALE_NAME", "US_FEMALE_NAME", "EMAIL_ADDRESS", ' - '"CANADA_SOCIAL_INSURANCE_NUMBER", "JAPAN_PASSPORT". If omitted, ' - 'the API will use a limited default set. Specify this flag ' - 'multiple times to specify multiple info types.') - parser_string.add_argument( - '--min_likelihood', - choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', - 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], - help='A string representing the minimum likelihood threshold that ' - 'constitutes a match.') - parser_file = subparsers.add_parser('image', help='Redact an image file.') - parser_file.add_argument( + parser.add_argument( 'filename', help='The path to the file to inspect.') - parser_file.add_argument( + parser.add_argument( 'output_filename', help='The path to which the redacted image will be written.') - parser_file.add_argument( + parser.add_argument( '--project', help='The Google Cloud project id to use as a parent resource.', default=default_project) - parser_file.add_argument( + parser.add_argument( '--info_types', action='append', help='Strings representing info types to look for. A full list of ' 'info categories and types is available from the API. 
Examples ' 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) - parser_file.add_argument( + parser.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'], help='A string representing the minimum likelihood threshold that ' 'constitutes a match.') - parser_file.add_argument( + parser.add_argument( '--mime_type', help='The MIME type of the file. If not specified, the type is ' 'inferred via the Python standard library\'s mimetypes module.') args = parser.parse_args() - if args.content == 'string': - redact_string( - args.item, args.replace_string, info_types=args.info_types, - min_likelihood=args.min_likelihood) - elif args.content == 'image': - redact_image( - args.project, args.filename, args.output_filename, - args.info_types, min_likelihood=args.min_likelihood, - mime_type=args.mime_type) + redact_image( + args.project, args.filename, args.output_filename, + args.info_types, min_likelihood=args.min_likelihood, + mime_type=args.mime_type) diff --git a/dlp/redact_test.py b/dlp/redact_test.py index 2c95606072b..50eb826b051 100644 --- a/dlp/redact_test.py +++ b/dlp/redact_test.py @@ -31,35 +31,6 @@ def tempdir(): shutil.rmtree(tempdir) -def test_redact_string(capsys): - test_string = 'I am Gary and my email is gary@example.com' - - redact.redact_string(test_string, 'REDACTED') - - out, _ = capsys.readouterr() - assert 'REDACTED' in out - - -def test_redact_string_with_info_types(capsys): - test_string = 'My email is gary@example.com and my number is 206-555-5555' - - redact.redact_string( - test_string, 'REDACTED', info_types=['PHONE_NUMBER']) - - out, _ = capsys.readouterr() - assert 'REDACTED' in out - assert out.count('REDACTED') == 1 - - -def test_redact_string_no_findings(capsys): - test_string = 'Nothing to see here' - - redact.redact_string(test_string, 'REDACTED') - - out, _ = capsys.readouterr() - assert 'REDACTED' not in out - - def test_redact_image_file(tempdir, capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.png') output_filepath = os.path.join(tempdir, 'redacted.png') diff --git a/dlp/requirements.txt b/dlp/requirements.txt index b973c95c668..f240b598378 100644 --- a/dlp/requirements.txt +++ b/dlp/requirements.txt @@ -1,3 +1,5 @@ google-cloud-dlp==0.1.1 google-cloud-storage==1.8.0 -google.cloud.pubsub==0.32.1 +google-cloud-pubsub==0.32.1 +google-cloud-datastore==1.6.0 +google-cloud-bigquery==0.31.0 From 64e419fb155be87b54d43a0dda552e04c8c849e0 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:12:30 -0700 Subject: [PATCH 2/4] fix region tags for redact image --- dlp/redact.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlp/redact.py b/dlp/redact.py index 7d7a5379293..5757782fa24 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -22,7 +22,7 @@ import os -# [START redact_image] +# [START dlp_redact_image] def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): """Uses the Data Loss Prevention API to redact protected data in an image. 
@@ -100,7 +100,7 @@ def redact_image(project, filename, output_filename, f.write(response.redacted_image) print("Wrote {byte_count} to {filename}".format( byte_count=len(response.redacted_image), filename=output_filename)) -# [END redact_string] +# [END dlp_redact_image] if __name__ == '__main__': From 1c3965c8aa967a8b2e6269f0fb8daca1296cb6db Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:19:17 -0700 Subject: [PATCH 3/4] Fix redact docstring --- dlp/redact.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlp/redact.py b/dlp/redact.py index 5757782fa24..9e0863374e9 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -27,6 +27,7 @@ def redact_image(project, filename, output_filename, info_types, min_likelihood=None, mime_type=None): """Uses the Data Loss Prevention API to redact protected data in an image. Args: + project: The Google Cloud project id to use as a parent resource. filename: The path to the file to inspect. output_filename: The path to which the redacted image will be written. info_types: A list of strings representing info types to look for. From 52779238e480679f10ad2eb4976dafeed13d0e43 Mon Sep 17 00:00:00 2001 From: Andrew Gorcester Date: Fri, 16 Mar 2018 14:23:12 -0700 Subject: [PATCH 4/4] Fix the redact image docstring AGAIN --- dlp/redact.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dlp/redact.py b/dlp/redact.py index 9e0863374e9..85fb9ef6458 100644 --- a/dlp/redact.py +++ b/dlp/redact.py @@ -31,8 +31,7 @@ def redact_image(project, filename, output_filename, filename: The path to the file to inspect. output_filename: The path to which the redacted image will be written. info_types: A list of strings representing info types to look for. - A full list of info type categories can be fetched from the API. If - info_types is omitted, the API will use a limited default set. + A full list of info type categories can be fetched from the API. min_likelihood: A string representing the minimum likelihood threshold that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
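
With this series applied, the new job types can be exercised end to end
roughly as follows, assuming GCLOUD_PROJECT is set and a topic named
dlp-test with a subscription dlp-test-subscription already exists, as in
the tests (the project, kind, dataset and table names are placeholders):

    python inspect_content.py datastore my-datastore-project Person \
        dlp-test dlp-test-subscription --info_types EMAIL_ADDRESS
    python inspect_content.py bigquery my-bq-project dlp_test_dataset \
        dlp_test_table dlp-test dlp-test-subscription
    python redact.py resources/test.png redacted.png --info_types PHONE_NUMBER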