From 8077cc7157d152379b7bb0fa884d72b66bd728d1 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 15:37:35 -0700 Subject: [PATCH 01/17] Add custom info type samples to inspect_content.py Use flags to indicate dictionary word lists and regex patterns, then parse them into custom info types. --- dlp/inspect_content.py | 169 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 4 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 3b2d5d4a60b..a85befe733c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -23,6 +23,7 @@ # [START dlp_inspect_string] def inspect_string(project, content_string, info_types, + custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, include_quote=True): """Uses the Data Loss Prevention API to analyze strings for protected data. Args: @@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types, # dictionaries (protos are also accepted). info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'include_quote': include_quote, 'limits': {'max_findings_per_request': max_findings}, @@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types, # [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, + custom_dictionaries=None, custom_regexes=None, max_findings=None, include_quote=True, mime_type=None): """Uses the Data Loss Prevention API to analyze a file for protected data. Args: @@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # [START dlp_inspect_gcs] def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, - info_types, min_likelihood=None, max_findings=None, - timeout=300): + info_types, custom_dictionaries=None, + custom_regexes=None, min_likelihood=None, + max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze a file on GCS. Args: project: The Google Cloud project id to use as a parent resource. @@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -293,8 +353,10 @@ def callback(message): # [START dlp_inspect_datastore] def inspect_datastore(project, datastore_project, kind, - topic_id, subscription_id, info_types, namespace_id=None, - min_likelihood=None, max_findings=None, timeout=300): + topic_id, subscription_id, info_types, + custom_dictionaries=None, custom_regexes=None, + namespace_id=None, min_likelihood=None, + max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze Datastore data. Args: project: The Google Cloud project id to use as a parent resource. @@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -424,6 +505,7 @@ def callback(message): # [START dlp_inspect_bigquery] def inspect_bigquery(project, bigquery_project, dataset_id, table_id, topic_id, subscription_id, info_types, + custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze BigQuery data. Args: @@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -571,6 +672,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_string.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_string.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_string.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -600,6 +711,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_file.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_file.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_file.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -648,6 +769,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_gcs.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_gcs.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' 
+ default=None) parser_gcs.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -692,6 +823,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_datastore.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_datastore.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_datastore.add_argument( '--namespace_id', help='The Datastore namespace to use, if applicable.') @@ -742,6 +883,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_bigquery.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_bigquery.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' 
+ default=None) parser_bigquery.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -762,12 +913,16 @@ def callback(message): if args.content == 'string': inspect_string( args.project, args.item, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote) elif args.content == 'file': inspect_file( args.project, args.filename, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote, @@ -777,6 +932,8 @@ def callback(message): args.project, args.bucket, args.filename, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) @@ -785,6 +942,8 @@ def callback(message): args.project, args.datastore_project, args.kind, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, namespace_id=args.namespace_id, min_likelihood=args.min_likelihood, max_findings=args.max_findings, @@ -794,6 +953,8 @@ def callback(message): args.project, args.bigquery_project, args.dataset_id, args.table_id, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) From dcdec4422b48a5123a6016af011f3c0ac027ef95 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 15:53:24 -0700 Subject: [PATCH 02/17] Make code compatible with python 2.7 --- dlp/inspect_content.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlp/inspect_content.py 
b/dlp/inspect_content.py index a85befe733c..b94c3f38c78 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -16,6 +16,7 @@ local file or a file on Google Cloud Storage.""" from __future__ import print_function +from builtins import range import argparse import os From f2919eeae5ed5bc91459e5336f43e4e25af742ce Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 16:03:57 -0700 Subject: [PATCH 03/17] Add missing commas --- dlp/inspect_content.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b94c3f38c78..da7442d3af6 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -676,12 +676,12 @@ def callback(message): parser_string.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_string.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_string.add_argument( '--min_likelihood', @@ -715,12 +715,12 @@ def callback(message): parser_file.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_file.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_file.add_argument( '--min_likelihood', @@ -773,12 +773,12 @@ def callback(message): parser_gcs.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' 
+ ' to search for as custom info types.', default=None) parser_gcs.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_gcs.add_argument( '--min_likelihood', @@ -827,12 +827,12 @@ def callback(message): parser_datastore.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_datastore.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_datastore.add_argument( '--namespace_id', @@ -887,12 +887,12 @@ def callback(message): parser_bigquery.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_bigquery.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' 
+ ' info types.', default=None) parser_bigquery.add_argument( '--min_likelihood', From 2ddc9caf2cbdd5aae413cbc2456f477b89ec9d16 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 10:52:24 -0700 Subject: [PATCH 04/17] Remove bad import --- dlp/inspect_content.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index da7442d3af6..d09c0d1624d 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -16,7 +16,6 @@ local file or a file on Google Cloud Storage.""" from __future__ import print_function -from builtins import range import argparse import os From bd24d49b47ebc1066901da15e7534541d5d71244 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:06:13 -0700 Subject: [PATCH 05/17] Add tests for custom info types --- dlp/inspect_content_test.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 3fd7874478d..4137f865ca8 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -169,6 +169,23 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out +def test_inspect_string_with_custom_info_types(capsys): + test_string = 'My name is Gary Smith and my email is gary@example.com' + dictionaries = ['Gary Smith'] + regexes = ['\\w+@\\w+.com'] + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out + def test_inspect_string_no_results(capsys): test_string = 'Nothing to see here' @@ -195,6 +212,23 @@ def test_inspect_file(capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out +def test_inspect_file_with_custom_info_types(capsys): + test_filepath = 
os.path.join(RESOURCE_DIRECTORY, 'test.txt') + dictionaries = ['gary@somedomain.com'] + regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out + def test_inspect_file_no_results(capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'harmless.txt') @@ -235,6 +269,24 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out +@flaky +def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): + dictionaries = ['gary@somedomain.com'] + regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] + + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + 'test.txt', + topic_id, + subscription_id, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out @flaky def test_inspect_gcs_file_no_results( From 5ecc9151168b81de44b51050b2fca41347ea0135 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:15:08 -0700 Subject: [PATCH 06/17] Add info_types parameter to deid.py --- dlp/deid.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 55882faaa97..98b41488267 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -20,7 +20,7 @@ # [START dlp_deidentify_masking] -def deidentify_with_mask(project, string, masking_character=None, +def deidentify_with_mask(project, string, info_types, masking_character=None, number_to_mask=0): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. 
@@ -44,6 +44,11 @@ def deidentify_with_mask(project, string, masking_character=None, # Convert the project id into a full resource id. parent = dlp.project_path(project) + # Construct inspect configuration dictionary + inspect_config = { + 'info_types': [{'name': info_type} for info_type in info_types] + } + # Construct deidentify configuration dictionary deidentify_config = { 'info_type_transformations': { @@ -65,7 +70,8 @@ def deidentify_with_mask(project, string, masking_character=None, # Call the API response = dlp.deidentify_content( - parent, deidentify_config=deidentify_config, item=item) + parent, inspect_config=inspect_config, + deidentify_config=deidentify_config, item=item) # Print out the results. print(response.item.value) @@ -73,7 +79,7 @@ def deidentify_with_mask(project, string, masking_character=None, # [START dlp_deidentify_fpe] -def deidentify_with_fpe(project, string, alphabet=None, +def deidentify_with_fpe(project, string, info_types, alphabet=None, surrogate_type=None, key_name=None, wrapped_key=None): """Uses the Data Loss Prevention API to deidentify sensitive data in a string using Format Preserving Encryption (FPE). 
@@ -127,6 +133,11 @@ def deidentify_with_fpe(project, string, alphabet=None, 'name': surrogate_type } + # Construct inspect configuration dictionary + inspect_config = { + 'info_types': [{'name': info_type} for info_type in info_types] + } + # Construct deidentify configuration dictionary deidentify_config = { 'info_type_transformations': { @@ -146,7 +157,8 @@ def deidentify_with_fpe(project, string, alphabet=None, # Call the API response = dlp.deidentify_content( - parent, deidentify_config=deidentify_config, item=item) + parent, inspect_config=inspect_config, + deidentify_config=deidentify_config, item=item) # Print results print(response.item.value) @@ -404,6 +416,13 @@ def write_data(data): 'deid_mask', help='Deidentify sensitive data in a string by masking it with a ' 'character.') + mask_parser.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) mask_parser.add_argument( 'project', help='The Google Cloud project id to use as a parent resource.') @@ -423,6 +442,13 @@ def write_data(data): 'deid_fpe', help='Deidentify sensitive data in a string using Format Preserving ' 'Encryption (FPE).') + fpe_parser.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) fpe_parser.add_argument( 'project', help='The Google Cloud project id to use as a parent resource.') @@ -532,11 +558,12 @@ def write_data(data): args = parser.parse_args() if args.content == 'deid_mask': - deidentify_with_mask(args.project, args.item, + deidentify_with_mask(args.project, args.item, args.info_types, masking_character=args.masking_character, number_to_mask=args.number_to_mask) elif args.content == 'deid_fpe': - deidentify_with_fpe(args.project, args.item, alphabet=args.alphabet, + deidentify_with_fpe(args.project, args.item, args.info_types, + alphabet=args.alphabet, wrapped_key=args.wrapped_key, key_name=args.key_name, surrogate_type=args.surrogate_type) From c80a2d95586956a984467e7853e2c28edef0d506 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:18:13 -0700 Subject: [PATCH 07/17] Update deid tests to use info_types parameter --- dlp/deid_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dlp/deid_test.py b/dlp/deid_test.py index 70e8290c067..f6bce36e79a 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -43,7 +43,8 @@ def tempdir(): def test_deidentify_with_mask(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING) + deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER']) out, _ = capsys.readouterr() assert 'My SSN is *********' in out @@ -60,6 +61,7 @@ def test_deidentify_with_mask_masking_character_specified(capsys): deid.deidentify_with_mask( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], masking_character='#') out, _ = capsys.readouterr() @@ -67,7 +69,9 @@ def test_deidentify_with_mask_masking_character_specified(capsys): def test_deidentify_with_mask_masking_number_specified(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, number_to_mask=7) + deid.deidentify_with_mask(GCLOUD_PROJECT, 
HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], + number_to_mask=7) out, _ = capsys.readouterr() assert 'My SSN is *******27' in out @@ -77,6 +81,7 @@ def test_deidentify_with_fpe(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], alphabet='NUMERIC', wrapped_key=WRAPPED_KEY, key_name=KEY_NAME) @@ -90,6 +95,7 @@ def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], alphabet='NUMERIC', wrapped_key=WRAPPED_KEY, key_name=KEY_NAME, From 019b5f7b55c6f1b1d964c0d2b9c0359a969fae9a Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:45:04 -0700 Subject: [PATCH 08/17] Fix indentation --- dlp/inspect_content.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index d09c0d1624d..acdd7f3f274 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -62,7 +62,7 @@ def inspect_string(project, content_string, info_types, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -142,7 +142,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -150,7 +150,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -255,7 +255,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -263,7 +263,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -401,7 +401,7 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -409,7 +409,7 @@ def inspect_datastore(project, datastore_project, kind, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -552,7 +552,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -560,7 +560,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} From 5258658bf7a372a26c79bdb418939d41ac914f09 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:46:24 -0700 Subject: [PATCH 09/17] Add blank lines --- dlp/inspect_content_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 4137f865ca8..d8c9f98d456 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -169,6 +169,7 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_string_with_custom_info_types(capsys): test_string = 'My name is Gary Smith and my email is gary@example.com' dictionaries = ['Gary Smith'] @@ -212,6 +213,7 @@ def test_inspect_file(capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_file_with_custom_info_types(capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') dictionaries = ['gary@somedomain.com'] @@ -269,6 +271,7 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out + @flaky def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): dictionaries = ['gary@somedomain.com'] @@ -288,6 +291,7 @@ def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_ assert 'Info type: CUSTOM_DICTIONARY_0' in out assert 'Info type: CUSTOM_REGEX_0' in out + @flaky def 
test_inspect_gcs_file_no_results( bucket, topic_id, subscription_id, capsys): From 47fc04f74c77db3bd5397459cf9242dc11521c37 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:51:15 -0700 Subject: [PATCH 10/17] Share logic for building custom info types --- dlp/inspect_content.py | 103 +++++++++++------------------------------ 1 file changed, 28 insertions(+), 75 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index acdd7f3f274..b2da99c4f5c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -53,21 +53,8 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -141,21 +128,8 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -254,21 +228,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -400,21 +361,8 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -551,21 +499,8 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
@@ -651,6 +586,24 @@ def callback(message): # [END dlp_inspect_bigquery] +def build_custom_info_types(custom_dictionaries, custom_regexes): + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + return dictionaries + regexes + + if __name__ == '__main__': default_project = os.environ.get('GCLOUD_PROJECT') From eb35add1561db92d614ac70f2a01c76e7f8f9655 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:52:07 -0700 Subject: [PATCH 11/17] Fix line too long --- dlp/inspect_content_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index d8c9f98d456..db1a0074142 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -273,7 +273,8 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): @flaky -def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): +def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, + subscription_id, capsys): dictionaries = ['gary@somedomain.com'] regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] From b4ffea6eef1fc2ccd2a4f17adb6e9492e54f1b76 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 13:06:14 -0700 Subject: [PATCH 12/17] Fix typo. 
--- dlp/inspect_content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b2da99c4f5c..7c3c47ed7d0 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. custom_info_types = build_custom_info_types(custom_dictionaries, - custom_info_types) + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. From 4f71b5b870bdc8ddd10c901a3f4358ff20f6e7e2 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Tue, 19 Jun 2018 09:08:17 -0700 Subject: [PATCH 13/17] Revert "Fix typo." This reverts commit b4ffea6eef1fc2ccd2a4f17adb6e9492e54f1b76, so that the sharing of the custom info type logic can be reverted as well to make the code samples more readable. --- dlp/inspect_content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 7c3c47ed7d0..b2da99c4f5c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + custom_info_types) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. From 72a91529e4f353029b6654cfbdfac0af71415790 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Tue, 19 Jun 2018 09:09:23 -0700 Subject: [PATCH 14/17] Revert "Share logic for building custom info types" This reverts commit 47fc04f74c77db3bd5397459cf9242dc11521c37. This makes the code samples more readable. 
--- dlp/inspect_content.py | 103 ++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b2da99c4f5c..acdd7f3f274 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -53,8 +53,21 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_info_types) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -128,8 +141,21 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may # optionally be omitted entirely. @@ -228,8 +254,21 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -361,8 +400,21 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -499,8 +551,21 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -586,24 +651,6 @@ def callback(message): # [END dlp_inspect_bigquery] -def build_custom_info_types(custom_dictionaries, custom_regexes): - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - return dictionaries + regexes - - if __name__ == '__main__': default_project = os.environ.get('GCLOUD_PROJECT') From 9384c1307adea7ab2eaaf6bc73ffce5d13fb7f22 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Mon, 2 Jul 2018 09:03:05 -0700 Subject: [PATCH 15/17] Switch from indexes to using enumerate. 
--- dlp/inspect_content.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index acdd7f3f274..0c2116f4e8b 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -58,15 +58,15 @@ def inspect_string(project, content_string, info_types, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -146,15 +146,15 @@ def inspect_file(project, filename, info_types, min_likelihood=None, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -259,15 +259,15 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -405,15 +405,15 @@ def inspect_datastore(project, datastore_project, kind, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -556,15 +556,15 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may From b640d1901af8b7ca303f5c4e12606636374b1fe4 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Mon, 2 Jul 2018 09:21:15 -0700 Subject: [PATCH 16/17] Updated help message for custom dictionaries. --- dlp/inspect_content.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 0c2116f4e8b..a2b606f66bc 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -675,7 +675,8 @@ def callback(message): parser_string.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_string.add_argument( '--custom_regexes', action='append', @@ -714,7 +715,8 @@ def callback(message): parser_file.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. 
Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_file.add_argument( '--custom_regexes', action='append', @@ -772,7 +774,8 @@ def callback(message): parser_gcs.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_gcs.add_argument( '--custom_regexes', action='append', @@ -826,7 +829,8 @@ def callback(message): parser_datastore.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_datastore.add_argument( '--custom_regexes', action='append', @@ -886,7 +890,8 @@ def callback(message): parser_bigquery.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_bigquery.add_argument( '--custom_regexes', action='append', From 08b4ccef20a53994fb392e94f935c7c5b2f8a66d Mon Sep 17 00:00:00 2001 From: mwdaub Date: Tue, 3 Jul 2018 08:54:13 -0700 Subject: [PATCH 17/17] Fix enumerate syntax error. 
--- dlp/inspect_content.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index a2b606f66bc..aedc002d465 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -66,7 +66,7 @@ def inspect_string(project, content_string, info_types, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -154,7 +154,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -267,7 +267,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -413,7 +413,7 @@ def inspect_datastore(project, datastore_project, kind, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -564,7 +564,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may