From 8077cc7157d152379b7bb0fa884d72b66bd728d1 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 15:37:35 -0700 Subject: [PATCH 01/17] Add custom info type samples to inspect_content.py Use flags to indicate dictionary word lists and regex patterns, then parse them into custom info types. --- dlp/inspect_content.py | 169 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 165 insertions(+), 4 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 3b2d5d4a60b..a85befe733c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -23,6 +23,7 @@ # [START dlp_inspect_string] def inspect_string(project, content_string, info_types, + custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, include_quote=True): """Uses the Data Loss Prevention API to analyze strings for protected data. Args: @@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types, # dictionaries (protos are also accepted). info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'include_quote': include_quote, 'limits': {'max_findings_per_request': max_findings}, @@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types, # [START dlp_inspect_file] def inspect_file(project, filename, info_types, min_likelihood=None, + custom_dictionaries=None, custom_regexes=None, max_findings=None, include_quote=True, mime_type=None): """Uses the Data Loss Prevention API to analyze a file for protected data. Args: @@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # [START dlp_inspect_gcs] def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, - info_types, min_likelihood=None, max_findings=None, - timeout=300): + info_types, custom_dictionaries=None, + custom_regexes=None, min_likelihood=None, + max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze a file on GCS. Args: project: The Google Cloud project id to use as a parent resource. @@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -293,8 +353,10 @@ def callback(message): # [START dlp_inspect_datastore] def inspect_datastore(project, datastore_project, kind, - topic_id, subscription_id, info_types, namespace_id=None, - min_likelihood=None, max_findings=None, timeout=300): + topic_id, subscription_id, info_types, + custom_dictionaries=None, custom_regexes=None, + namespace_id=None, min_likelihood=None, + max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze Datastore data. Args: project: The Google Cloud project id to use as a parent resource. @@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -424,6 +505,7 @@ def callback(message): # [START dlp_inspect_bigquery] def inspect_bigquery(project, bigquery_project, dataset_id, table_id, topic_id, subscription_id, info_types, + custom_dictionaries=None, custom_regexes=None, min_likelihood=None, max_findings=None, timeout=300): """Uses the Data Loss Prevention API to analyze BigQuery data. Args: @@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] info_types = [{'name': info_type} for info_type in info_types] + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes + # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. inspect_config = { 'info_types': info_types, + 'custom_info_types': custom_info_types, 'min_likelihood': min_likelihood, 'limits': {'max_findings_per_request': max_findings}, } @@ -571,6 +672,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_string.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_string.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_string.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -600,6 +711,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_file.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_file.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_file.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -648,6 +769,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_gcs.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_gcs.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' 
+ default=None) parser_gcs.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -692,6 +823,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_datastore.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_datastore.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' + default=None) parser_datastore.add_argument( '--namespace_id', help='The Datastore namespace to use, if applicable.') @@ -742,6 +883,16 @@ def callback(message): 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' 'If unspecified, the three above examples will be used.', default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) + parser_bigquery.add_argument( + '--custom_dictionaries', action='append', + help='Strings representing comma-delimited lists of dictionary words' + ' to search for as custom info types.' + default=None) + parser_bigquery.add_argument( + '--custom_regexes', action='append', + help='Strings representing regex patterns to search for as custom ' + ' info types.' 
+ default=None) parser_bigquery.add_argument( '--min_likelihood', choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', @@ -762,12 +913,16 @@ def callback(message): if args.content == 'string': inspect_string( args.project, args.item, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote) elif args.content == 'file': inspect_file( args.project, args.filename, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, include_quote=args.include_quote, @@ -777,6 +932,8 @@ def callback(message): args.project, args.bucket, args.filename, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) @@ -785,6 +942,8 @@ def callback(message): args.project, args.datastore_project, args.kind, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, namespace_id=args.namespace_id, min_likelihood=args.min_likelihood, max_findings=args.max_findings, @@ -794,6 +953,8 @@ def callback(message): args.project, args.bigquery_project, args.dataset_id, args.table_id, args.topic_id, args.subscription_id, args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, min_likelihood=args.min_likelihood, max_findings=args.max_findings, timeout=args.timeout) From dcdec4422b48a5123a6016af011f3c0ac027ef95 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 15:53:24 -0700 Subject: [PATCH 02/17] Make code compatible with python 2.7 --- dlp/inspect_content.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dlp/inspect_content.py 
b/dlp/inspect_content.py index a85befe733c..b94c3f38c78 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -16,6 +16,7 @@ local file or a file on Google Cloud Storage.""" from __future__ import print_function +from builtins import range import argparse import os From f2919eeae5ed5bc91459e5336f43e4e25af742ce Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 6 Jun 2018 16:03:57 -0700 Subject: [PATCH 03/17] Add missing commas --- dlp/inspect_content.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b94c3f38c78..da7442d3af6 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -676,12 +676,12 @@ def callback(message): parser_string.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_string.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_string.add_argument( '--min_likelihood', @@ -715,12 +715,12 @@ def callback(message): parser_file.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_file.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_file.add_argument( '--min_likelihood', @@ -773,12 +773,12 @@ def callback(message): parser_gcs.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' 
+ ' to search for as custom info types.', default=None) parser_gcs.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_gcs.add_argument( '--min_likelihood', @@ -827,12 +827,12 @@ def callback(message): parser_datastore.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_datastore.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' + ' info types.', default=None) parser_datastore.add_argument( '--namespace_id', @@ -887,12 +887,12 @@ def callback(message): parser_bigquery.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.' + ' to search for as custom info types.', default=None) parser_bigquery.add_argument( '--custom_regexes', action='append', help='Strings representing regex patterns to search for as custom ' - ' info types.' 
+ ' info types.', default=None) parser_bigquery.add_argument( '--min_likelihood', From 2ddc9caf2cbdd5aae413cbc2456f477b89ec9d16 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 10:52:24 -0700 Subject: [PATCH 04/17] Remove bad import --- dlp/inspect_content.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index da7442d3af6..d09c0d1624d 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -16,7 +16,6 @@ local file or a file on Google Cloud Storage.""" from __future__ import print_function -from builtins import range import argparse import os From bd24d49b47ebc1066901da15e7534541d5d71244 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:06:13 -0700 Subject: [PATCH 05/17] Add tests for custom info types --- dlp/inspect_content_test.py | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 3fd7874478d..4137f865ca8 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -169,6 +169,23 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out +def test_inspect_string_with_custom_info_types(capsys): + test_string = 'My name is Gary Smith and my email is gary@example.com' + dictionaries = ['Gary Smith'] + regexes = ['\\w+@\\w+.com'] + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out + def test_inspect_string_no_results(capsys): test_string = 'Nothing to see here' @@ -195,6 +212,23 @@ def test_inspect_file(capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out +def test_inspect_file_with_custom_info_types(capsys): + test_filepath = 
os.path.join(RESOURCE_DIRECTORY, 'test.txt') + dictionaries = ['gary@somedomain.com'] + regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out + def test_inspect_file_no_results(capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'harmless.txt') @@ -235,6 +269,24 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out +@flaky +def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): + dictionaries = ['gary@somedomain.com'] + regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] + + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + 'test.txt', + topic_id, + subscription_id, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes) + + out, _ = capsys.readouterr() + assert 'Info type: CUSTOM_DICTIONARY_0' in out + assert 'Info type: CUSTOM_REGEX_0' in out @flaky def test_inspect_gcs_file_no_results( From 5ecc9151168b81de44b51050b2fca41347ea0135 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:15:08 -0700 Subject: [PATCH 06/17] Add info_types parameter to deid.py --- dlp/deid.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 55882faaa97..98b41488267 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -20,7 +20,7 @@ # [START dlp_deidentify_masking] -def deidentify_with_mask(project, string, masking_character=None, +def deidentify_with_mask(project, string, info_types, masking_character=None, number_to_mask=0): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. 
@@ -44,6 +44,11 @@ def deidentify_with_mask(project, string, masking_character=None, # Convert the project id into a full resource id. parent = dlp.project_path(project) + # Construct inspect configuration dictionary + inspect_config = { + 'info_types': [{'name': info_type} for info_type in info_types] + } + # Construct deidentify configuration dictionary deidentify_config = { 'info_type_transformations': { @@ -65,7 +70,8 @@ def deidentify_with_mask(project, string, masking_character=None, # Call the API response = dlp.deidentify_content( - parent, deidentify_config=deidentify_config, item=item) + parent, inspect_config=inspect_config, + deidentify_config=deidentify_config, item=item) # Print out the results. print(response.item.value) @@ -73,7 +79,7 @@ def deidentify_with_mask(project, string, masking_character=None, # [START dlp_deidentify_fpe] -def deidentify_with_fpe(project, string, alphabet=None, +def deidentify_with_fpe(project, string, info_types, alphabet=None, surrogate_type=None, key_name=None, wrapped_key=None): """Uses the Data Loss Prevention API to deidentify sensitive data in a string using Format Preserving Encryption (FPE). 
@@ -127,6 +133,11 @@ def deidentify_with_fpe(project, string, alphabet=None, 'name': surrogate_type } + # Construct inspect configuration dictionary + inspect_config = { + 'info_types': [{'name': info_type} for info_type in info_types] + } + # Construct deidentify configuration dictionary deidentify_config = { 'info_type_transformations': { @@ -146,7 +157,8 @@ def deidentify_with_fpe(project, string, alphabet=None, # Call the API response = dlp.deidentify_content( - parent, deidentify_config=deidentify_config, item=item) + parent, inspect_config=inspect_config, + deidentify_config=deidentify_config, item=item) # Print results print(response.item.value) @@ -404,6 +416,13 @@ def write_data(data): 'deid_mask', help='Deidentify sensitive data in a string by masking it with a ' 'character.') + mask_parser.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) mask_parser.add_argument( 'project', help='The Google Cloud project id to use as a parent resource.') @@ -423,6 +442,13 @@ def write_data(data): 'deid_fpe', help='Deidentify sensitive data in a string using Format Preserving ' 'Encryption (FPE).') + fpe_parser.add_argument( + '--info_types', action='append', + help='Strings representing info types to look for. A full list of ' + 'info categories and types is available from the API. Examples ' + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + 'If unspecified, the three above examples will be used.', + default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) fpe_parser.add_argument( 'project', help='The Google Cloud project id to use as a parent resource.') @@ -532,11 +558,12 @@ def write_data(data): args = parser.parse_args() if args.content == 'deid_mask': - deidentify_with_mask(args.project, args.item, + deidentify_with_mask(args.project, args.item, args.info_types, masking_character=args.masking_character, number_to_mask=args.number_to_mask) elif args.content == 'deid_fpe': - deidentify_with_fpe(args.project, args.item, alphabet=args.alphabet, + deidentify_with_fpe(args.project, args.item, args.info_types, + alphabet=args.alphabet, wrapped_key=args.wrapped_key, key_name=args.key_name, surrogate_type=args.surrogate_type) From c80a2d95586956a984467e7853e2c28edef0d506 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Wed, 13 Jun 2018 11:18:13 -0700 Subject: [PATCH 07/17] Update deid tests to use info_types parameter --- dlp/deid_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dlp/deid_test.py b/dlp/deid_test.py index 70e8290c067..f6bce36e79a 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -43,7 +43,8 @@ def tempdir(): def test_deidentify_with_mask(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING) + deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER']) out, _ = capsys.readouterr() assert 'My SSN is *********' in out @@ -60,6 +61,7 @@ def test_deidentify_with_mask_masking_character_specified(capsys): deid.deidentify_with_mask( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], masking_character='#') out, _ = capsys.readouterr() @@ -67,7 +69,9 @@ def test_deidentify_with_mask_masking_character_specified(capsys): def test_deidentify_with_mask_masking_number_specified(capsys): - deid.deidentify_with_mask(GCLOUD_PROJECT, HARMFUL_STRING, number_to_mask=7) + deid.deidentify_with_mask(GCLOUD_PROJECT, 
HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], + number_to_mask=7) out, _ = capsys.readouterr() assert 'My SSN is *******27' in out @@ -77,6 +81,7 @@ def test_deidentify_with_fpe(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], alphabet='NUMERIC', wrapped_key=WRAPPED_KEY, key_name=KEY_NAME) @@ -90,6 +95,7 @@ def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT, HARMFUL_STRING, + ['US_SOCIAL_SECURITY_NUMBER'], alphabet='NUMERIC', wrapped_key=WRAPPED_KEY, key_name=KEY_NAME, From 019b5f7b55c6f1b1d964c0d2b9c0359a969fae9a Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:45:04 -0700 Subject: [PATCH 08/17] Fix indentation --- dlp/inspect_content.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index d09c0d1624d..acdd7f3f274 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -62,7 +62,7 @@ def inspect_string(project, content_string, info_types, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -142,7 +142,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -150,7 +150,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -255,7 +255,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -263,7 +263,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -401,7 +401,7 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -409,7 +409,7 @@ def inspect_datastore(project, datastore_project, kind, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} @@ -552,7 +552,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
if custom_dictionaries is None: - custom_dictionaries = [] + custom_dictionaries = [] dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { @@ -560,7 +560,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, } } for i in range(len(custom_dictionaries))] if custom_regexes is None: - custom_regexes = [] + custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regexes[i]} From 5258658bf7a372a26c79bdb418939d41ac914f09 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:46:24 -0700 Subject: [PATCH 09/17] Add blank lines --- dlp/inspect_content_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index 4137f865ca8..d8c9f98d456 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -169,6 +169,7 @@ def test_inspect_string(capsys): assert 'Info type: FIRST_NAME' in out assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_string_with_custom_info_types(capsys): test_string = 'My name is Gary Smith and my email is gary@example.com' dictionaries = ['Gary Smith'] @@ -212,6 +213,7 @@ def test_inspect_file(capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out + def test_inspect_file_with_custom_info_types(capsys): test_filepath = os.path.join(RESOURCE_DIRECTORY, 'test.txt') dictionaries = ['gary@somedomain.com'] @@ -269,6 +271,7 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): out, _ = capsys.readouterr() assert 'Info type: EMAIL_ADDRESS' in out + @flaky def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): dictionaries = ['gary@somedomain.com'] @@ -288,6 +291,7 @@ def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_ assert 'Info type: CUSTOM_DICTIONARY_0' in out assert 'Info type: CUSTOM_REGEX_0' in out + @flaky def 
test_inspect_gcs_file_no_results( bucket, topic_id, subscription_id, capsys): From 47fc04f74c77db3bd5397459cf9242dc11521c37 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:51:15 -0700 Subject: [PATCH 10/17] Share logic for building custom info types --- dlp/inspect_content.py | 103 +++++++++++------------------------------ 1 file changed, 28 insertions(+), 75 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index acdd7f3f274..b2da99c4f5c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -53,21 +53,8 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -141,21 +128,8 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -254,21 +228,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -400,21 +361,8 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -551,21 +499,8 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - custom_info_types = dictionaries + regexes + custom_info_types = build_custom_info_types(custom_dictionaries, + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. 
@@ -651,6 +586,24 @@ def callback(message): # [END dlp_inspect_bigquery] +def build_custom_info_types(custom_dictionaries, custom_regexes): + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + return dictionaries + regexes + + if __name__ == '__main__': default_project = os.environ.get('GCLOUD_PROJECT') From eb35add1561db92d614ac70f2a01c76e7f8f9655 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 10:52:07 -0700 Subject: [PATCH 11/17] Fix line too long --- dlp/inspect_content_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlp/inspect_content_test.py b/dlp/inspect_content_test.py index d8c9f98d456..db1a0074142 100644 --- a/dlp/inspect_content_test.py +++ b/dlp/inspect_content_test.py @@ -273,7 +273,8 @@ def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): @flaky -def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, subscription_id, capsys): +def test_inspect_gcs_file_with_custom_info_types(bucket, topic_id, + subscription_id, capsys): dictionaries = ['gary@somedomain.com'] regexes = ['\\(\\d{3}\\) \\d{3}-\\d{4}'] From b4ffea6eef1fc2ccd2a4f17adb6e9492e54f1b76 Mon Sep 17 00:00:00 2001 From: mwdaub Date: Thu, 14 Jun 2018 13:06:14 -0700 Subject: [PATCH 12/17] Fix typo. 
--- dlp/inspect_content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b2da99c4f5c..7c3c47ed7d0 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. custom_info_types = build_custom_info_types(custom_dictionaries, - custom_info_types) + custom_regexes) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. From 4f71b5b870bdc8ddd10c901a3f4358ff20f6e7e2 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Tue, 19 Jun 2018 09:08:17 -0700 Subject: [PATCH 13/17] Revert "Fix typo." This reverts commit b4ffea6eef1fc2ccd2a4f17adb6e9492e54f1b76, so that the sharing of the custom info type logic can be reverted as well to make the code samples more readable. --- dlp/inspect_content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 7c3c47ed7d0..b2da99c4f5c 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -54,7 +54,7 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + custom_info_types) # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. From 72a91529e4f353029b6654cfbdfac0af71415790 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Tue, 19 Jun 2018 09:09:23 -0700 Subject: [PATCH 14/17] Revert "Share logic for building custom info types" This reverts commit 47fc04f74c77db3bd5397459cf9242dc11521c37. This makes the code samples more readable. 
--- dlp/inspect_content.py | 103 ++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index b2da99c4f5c..acdd7f3f274 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -53,8 +53,21 @@ def inspect_string(project, content_string, info_types, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_info_types) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -128,8 +141,21 @@ def inspect_file(project, filename, info_types, min_likelihood=None, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may # optionally be omitted entirely. @@ -228,8 +254,21 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -361,8 +400,21 @@ def inspect_datastore(project, datastore_project, kind, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. - custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -499,8 +551,21 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, # Prepare custom_info_types by parsing the dictionary word lists and # regex patterns. 
- custom_info_types = build_custom_info_types(custom_dictionaries, - custom_regexes) + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [{ + 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, + 'dictionary': { + 'word_list': {'words': custom_dictionaries[i].split(',')} + } + } for i in range(len(custom_dictionaries))] + if custom_regexes is None: + custom_regexes = [] + regexes = [{ + 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, + 'regex': {'pattern': custom_regexes[i]} + } for i in range(len(custom_regexes))] + custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may # optionally be omitted entirely. @@ -586,24 +651,6 @@ def callback(message): # [END dlp_inspect_bigquery] -def build_custom_info_types(custom_dictionaries, custom_regexes): - if custom_dictionaries is None: - custom_dictionaries = [] - dictionaries = [{ - 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, - 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} - } - } for i in range(len(custom_dictionaries))] - if custom_regexes is None: - custom_regexes = [] - regexes = [{ - 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] - return dictionaries + regexes - - if __name__ == '__main__': default_project = os.environ.get('GCLOUD_PROJECT') From 9384c1307adea7ab2eaaf6bc73ffce5d13fb7f22 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Mon, 2 Jul 2018 09:03:05 -0700 Subject: [PATCH 15/17] Switch from indexes to using enumerate. 
--- dlp/inspect_content.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index acdd7f3f274..0c2116f4e8b 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -58,15 +58,15 @@ def inspect_string(project, content_string, info_types, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -146,15 +146,15 @@ def inspect_file(project, filename, info_types, min_likelihood=None, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -259,15 +259,15 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -405,15 +405,15 @@ def inspect_datastore(project, datastore_project, kind, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -556,15 +556,15 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, dictionaries = [{ 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, 'dictionary': { - 'word_list': {'words': custom_dictionaries[i].split(',')} + 'word_list': {'words': custom_dict.split(',')} } - } for i in range(len(custom_dictionaries))] + } for i, custom_dict in enumerate(custom_dictionaries)] if custom_regexes is None: custom_regexes = [] regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, - 'regex': {'pattern': custom_regexes[i]} - } for i in range(len(custom_regexes))] + 'regex': {'pattern': custom_regex} + } for i in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may From b640d1901af8b7ca303f5c4e12606636374b1fe4 Mon Sep 17 00:00:00 2001 From: Michael Daub Date: Mon, 2 Jul 2018 09:21:15 -0700 Subject: [PATCH 16/17] Updated help message for custom dictionaries. --- dlp/inspect_content.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index 0c2116f4e8b..a2b606f66bc 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -675,7 +675,8 @@ def callback(message): parser_string.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_string.add_argument( '--custom_regexes', action='append', @@ -714,7 +715,8 @@ def callback(message): parser_file.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. 
Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_file.add_argument( '--custom_regexes', action='append', @@ -772,7 +774,8 @@ def callback(message): parser_gcs.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_gcs.add_argument( '--custom_regexes', action='append', @@ -826,7 +829,8 @@ def callback(message): parser_datastore.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_datastore.add_argument( '--custom_regexes', action='append', @@ -886,7 +890,8 @@ def callback(message): parser_bigquery.add_argument( '--custom_dictionaries', action='append', help='Strings representing comma-delimited lists of dictionary words' - ' to search for as custom info types.', + ' to search for as custom info types. Each string is a comma ' + 'delimited list of words representing a distinct dictionary.', default=None) parser_bigquery.add_argument( '--custom_regexes', action='append', From 08b4ccef20a53994fb392e94f935c7c5b2f8a66d Mon Sep 17 00:00:00 2001 From: mwdaub Date: Tue, 3 Jul 2018 08:54:13 -0700 Subject: [PATCH 17/17] Fix enumerate syntax error. 
--- dlp/inspect_content.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlp/inspect_content.py b/dlp/inspect_content.py index a2b606f66bc..aedc002d465 100644 --- a/dlp/inspect_content.py +++ b/dlp/inspect_content.py @@ -66,7 +66,7 @@ def inspect_string(project, content_string, info_types, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -154,7 +154,7 @@ def inspect_file(project, filename, info_types, min_likelihood=None, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -267,7 +267,7 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may @@ -413,7 +413,7 @@ def inspect_datastore(project, datastore_project, kind, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. 
Keys which are None may @@ -564,7 +564,7 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, regexes = [{ 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, 'regex': {'pattern': custom_regex} - } for i in enumerate(custom_regexes)] + } for i, custom_regex in enumerate(custom_regexes)] custom_info_types = dictionaries + regexes # Construct the configuration dictionary. Keys which are None may