diff --git a/storage/cost-analysis/README.md b/storage/cost-analysis/README.md
new file mode 100644
index 00000000000..d242d41d43c
--- /dev/null
+++ b/storage/cost-analysis/README.md
@@ -0,0 +1,52 @@
+Google Cloud Storage Python Samples
+===============================================================================
+
+[![Open in Cloud Shell button](https://gstatic.com/cloudssh/images/open-btn.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=storage/cost-analysis/README.md)
+
+**Google Cloud Storage:** https://cloud.google.com/storage/docs
+
+Samples
+-------------------------------------------------------------------------------
+NOTE: This guide assumes a base level of familiarity with Google Cloud Storage features, terminology, and pricing.
+
+### Google Cloud Storage Soft Delete Cost Analyzer
+-------------------------------------------------------------------------------
+**Understanding Soft Delete and Cost Considerations**
+ 1. Soft Delete: A feature that protects against accidental data loss. Deleted objects are retained for a defined period before permanent deletion. This adds safety but carries potential additional storage costs.
+ 2. Cost Analysis: This script estimates the relative cost increase for each bucket if soft delete is enabled. The estimate depends on:
+    * Your soft delete retention window
+    * The amount of data likely to be soft-deleted
+    * The proportion of data in each storage class (e.g., Standard, Nearline)
+
+**How to Use the Script**
+
+**Prerequisites**
+
+ 1. A Google Cloud Platform (GCP) project with existing buckets.
+ 2. Permissions on your GCP project to use the Google Cloud Storage and Monitoring APIs.
+ 3. A Python environment (https://cloud.google.com/python/setup)
+
+**Command-Line Arguments**
+* `project_name` (Required): Your GCP project name.
+* `--cost_threshold` (Optional, default=0): Relative cost threshold above which a bucket is reported.
+* `--soft_delete_window` (Optional, default=604800, i.e. 7 days): Time window (in seconds) for considering soft-deleted objects.
+* `--agg_days` (Optional, default=30): The period (in days) over which to combine and aggregate results.
+* `--lookback_days` (Optional, default=360): Number of days of historical metric data to consider.
+* `--list` (Optional): Produces a space-separated list of bucket names instead of JSON output.
+
+Note: A `--cost_threshold` of 0.15, for example, would spotlight buckets where enabling soft delete might increase storage costs by more than 15%.
+
+```bash
+$ python storage_soft_delete_relative_cost_analyzer.py my-project-name
+```
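+
+The reported value is the product of three factors: the soft-delete ratio, the storage class ratio, and the relative storage class cost. A minimal sketch (not part of the sample; all numbers are hypothetical) of how they combine for a single bucket and storage class:
+
+```python
+# Hypothetical inputs for one bucket/storage class combination:
+soft_delete_ratio = 0.05            # retention window * deleted bytes / total byte-seconds
+storage_class_ratio = 0.8           # share of the bucket's data in this storage class
+relative_storage_class_cost = 1.0   # Standard storage, priced relative to Standard
+
+# Relative cost increase contributed by this storage class:
+relative_cost = soft_delete_ratio * storage_class_ratio * relative_storage_class_cost
+print(round(relative_cost, 4))  # 0.04 -> soft delete would add roughly 4% to storage cost
+```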
+
+**Important Note:** To disable soft delete for buckets flagged by the script, follow these steps:
+
+```bash
+# 1. Run the analyzer to generate a list of buckets exceeding your cost threshold:
+python storage_soft_delete_relative_cost_analyzer.py [your-project-name] [OTHER_OPTIONS] --list > list_of_buckets.txt
+
+# 2. Update the buckets using the generated list:
+cat list_of_buckets.txt | gcloud storage buckets update -I --clear-soft-delete
+```
\ No newline at end of file
diff --git a/storage/cost-analysis/storage_soft_delete_relative_cost_analyzer.py b/storage/cost-analysis/storage_soft_delete_relative_cost_analyzer.py
new file mode 100644
index 00000000000..c2641214055
--- /dev/null
+++ b/storage/cost-analysis/storage_soft_delete_relative_cost_analyzer.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python
+
+# Copyright 2024 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Identifies buckets with a relative increase in cost from enabling soft delete.
+
+The relative cost increase of enabling soft delete is calculated by combining
+the storage/v2/deleted_bytes metric with the existing
+storage/v2/total_byte_seconds metric.
+
+Relative cost of each bucket = ('soft delete retention duration'
+                                * 'deleted bytes' / 'total byte seconds')
+                               * 'relative cost of the storage class'
+                               * 'ratio of the storage class'.
+"""
+
+# [START storage_soft_delete_relative_cost]
+
+import argparse
+import json
+from typing import Dict, List
+
+import google.cloud.monitoring_v3 as monitoring_client
+
+
+def get_relative_cost(storage_class: str) -> float:
+    """Retrieves the relative cost for a given storage class.
+
+    Args:
+        storage_class: The storage class (e.g., 'STANDARD', 'NEARLINE').
+
+    Returns:
+        The price per GB from https://cloud.google.com/storage/pricing,
+        divided by the price per GB of the Standard storage class.
+    """
+    relative_cost = {
+        "STANDARD": 0.023 / 0.023,
+        "NEARLINE": 0.013 / 0.023,
+        "COLDLINE": 0.007 / 0.023,
+        "ARCHIVE": 0.0025 / 0.023,
+    }
+
+    return relative_cost.get(storage_class, 1.0)
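+
+# For example, get_relative_cost("NEARLINE") returns 0.013 / 0.023, roughly
+# 0.57: under the list prices above, Nearline data costs about 57% as much to
+# store as the same data in Standard storage. Unknown classes fall back to 1.0.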
+
+
+def get_soft_delete_cost(
+    project_name: str,
+    soft_delete_window: int,
+    agg_days: int,
+    lookback_days: int,
+) -> Dict[str, List[Dict[str, float]]]:
+    """Calculates soft delete costs for buckets in a Google Cloud project.
+
+    Args:
+        project_name: The name of the Google Cloud project.
+        soft_delete_window: The time window in seconds for considering
+            soft-deleted objects (default is 7 days).
+        agg_days: Aggregate results over this period (in days); defaults to 30.
+        lookback_days: Number of days of historical data to consider;
+            defaults to 360.
+
+    Returns:
+        A dictionary with bucket names as keys and cost data for each bucket,
+        broken down by storage class.
+    """
+
+    query_client = monitoring_client.QueryServiceClient()
+
+    # Step 1: Get storage class ratios for each bucket.
+    storage_ratios_by_bucket = get_storage_class_ratio(
+        project_name, query_client, agg_days, lookback_days
+    )
+
+    # Step 2: Fetch soft-deleted bytes and calculate costs using the
+    # Monitoring API.
+    soft_deleted_costs = calculate_soft_delete_costs(
+        project_name,
+        query_client,
+        soft_delete_window,
+        storage_ratios_by_bucket,
+        agg_days,
+        lookback_days,
+    )
+
+    return soft_deleted_costs
+
+
+def calculate_soft_delete_costs(
+    project_name: str,
+    query_client: monitoring_client.QueryServiceClient,
+    soft_delete_window: int,
+    storage_ratios_by_bucket: Dict[str, float],
+    agg_days: int,
+    lookback_days: int,
+) -> Dict[str, List[Dict[str, float]]]:
+    """Calculates the relative cost of enabling soft delete for each bucket
+    in a project over the given time frame.
+
+    Args:
+        project_name: The name of the Google Cloud project.
+        query_client: A Monitoring API query client.
+        soft_delete_window: The time window in seconds for considering
+            soft-deleted objects (default is 7 days).
+        storage_ratios_by_bucket: A dictionary of storage class ratios per
+            bucket.
+        agg_days: Aggregate results over this period (in days); defaults to 30.
+        lookback_days: Number of days of historical data to consider;
+            defaults to 360.
+
+    Returns:
+        A dictionary with bucket names as keys and a list of cost data
+        dictionaries for each bucket, broken down by storage class.
+    """
+    soft_deleted_bytes_time = query_client.query_time_series(
+        monitoring_client.QueryTimeSeriesRequest(
+            name=f"projects/{project_name}",
+            query=f"""
+                    {{
+                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/deleted_bytes
+                    | group_by [resource.bucket_name, metric.storage_class, resource.location], window(), .sum;
+                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
+                    | group_by [resource.bucket_name, metric.storage_class, resource.location], window(), .sum
+                    }}
+                    | ratio  # Calculate the ratio of deleted bytes to total byte-seconds
+                    | value val(0) * {soft_delete_window}'s'
+                    | every {agg_days}d
+                    | within {lookback_days}d
+                    """,
+        )
+    )
+    buckets: Dict[str, List[Dict[str, float]]] = {}
+    missing_distribution_storage_class = []
+    for data_point in soft_deleted_bytes_time.time_series_data:
+        bucket_name = data_point.label_values[0].string_value
+        storage_class = data_point.label_values[1].string_value
+        # To include location-based cost analysis:
+        # 1. Uncomment the line below:
+        # location = data_point.label_values[2].string_value
+        # 2. Update the 'relative_storage_class_cost' calculation to factor
+        #    in location.
+        soft_delete_ratio = data_point.point_data[0].values[0].double_value
+        distribution_storage_class = bucket_name + " - " + storage_class
+        storage_class_ratio = storage_ratios_by_bucket.get(
+            distribution_storage_class
+        )
+        if storage_class_ratio is None:
+            missing_distribution_storage_class.append(
+                distribution_storage_class
+            )
+        buckets.setdefault(bucket_name, []).append({
+            # Include storage class and location data for additional
+            # plotting dimensions.
+            # "storage_class": storage_class,
+            # "location": location,
+            "soft_delete_ratio": soft_delete_ratio,
+            "storage_class_ratio": storage_class_ratio,
+            "relative_storage_class_cost": get_relative_cost(storage_class),
+        })
+
+    if missing_distribution_storage_class:
+        print(
+            "Missing storage class ratios for the following buckets:",
+            missing_distribution_storage_class,
+        )
+        raise ValueError("Cannot proceed with missing storage class ratios.")
+
+    return buckets
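+
+# Rough intuition for calculate_soft_delete_costs (hypothetical numbers):
+# with a 7-day retention window (604,800 s), a bucket that soft-deletes
+# 10 GiB during a 30-day aggregation period (2,592,000 s) while storing
+# 100 GiB on average has
+#   soft_delete_ratio ~= (10 GiB * 604,800 s) / (100 GiB * 2,592,000 s) ~= 0.023,
+# i.e. soft delete would add roughly 2.3% to that bucket's storage footprint.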
+
+
+def get_storage_class_ratio(
+    project_name: str,
+    query_client: monitoring_client.QueryServiceClient,
+    agg_days: int,
+    lookback_days: int,
+) -> Dict[str, float]:
+    """Calculates storage class ratios for each bucket in a project.
+
+    This information helps determine the relative cost contribution of each
+    storage class to the overall soft-delete cost.
+
+    Args:
+        project_name: The Google Cloud project name.
+        query_client: A Monitoring API query client.
+        agg_days: Aggregate results over this period (in days); defaults to 30.
+        lookback_days: Number of days of historical data to consider;
+            defaults to 360.
+
+    Returns:
+        The ratio of each storage class within a bucket.
+    """
+    request = monitoring_client.QueryTimeSeriesRequest(
+        name=f"projects/{project_name}",
+        query=f"""
+                {{
+                # Fetch total byte-seconds for each bucket and storage class.
+                fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
+                | group_by [resource.bucket_name, metric.storage_class], window(), .sum;
+                # Fetch total byte-seconds for each bucket (regardless of class).
+                fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
+                | group_by [resource.bucket_name], window(), .sum
+                }}
+                | ratio  # Calculate the ratio of each storage class size to total size
+                | every {agg_days}d
+                | within {lookback_days}d
+                """,
+    )
+
+    storage_class_ratio = query_client.query_time_series(request)
+
+    storage_ratios_by_bucket = {}
+    for time_series in storage_class_ratio.time_series_data:
+        bucket_name = time_series.label_values[0].string_value
+        storage_class = time_series.label_values[1].string_value
+        ratio = time_series.point_data[0].values[0].double_value
+
+        # Create a descriptive key for the dictionary.
+        key = f"{bucket_name} - {storage_class}"
+        storage_ratios_by_bucket[key] = ratio
+
+    return storage_ratios_by_bucket
+
+
+def soft_delete_relative_cost_analyzer(
+    project_name: str,
+    cost_threshold: float = 0.0,
+    soft_delete_window: int = 604800,
+    agg_days: int = 30,
+    lookback_days: int = 360,
+    list_buckets: bool = False,
+) -> str:
+    """Identifies buckets exceeding the relative cost threshold for enabling
+    soft delete.
+
+    Args:
+        project_name: The Google Cloud project name.
+        cost_threshold: Threshold above which to consider removing soft delete.
+        soft_delete_window: Time window for calculating soft-delete costs (in
+            seconds).
+        agg_days: Aggregate results over this time period (in days).
+        lookback_days: Look back up to this many days.
+        list_buckets: Return a space-separated string of bucket names (True)
+            instead of JSON (False, default).
+
+    Returns:
+        JSON-formatted results of buckets exceeding the threshold and their
+        costs, *or* a space-separated string of bucket names.
+    """
+
+    buckets: Dict[str, float] = {}
+    for bucket_name, storage_sources in get_soft_delete_cost(
+        project_name, soft_delete_window, agg_days, lookback_days
+    ).items():
+        bucket_cost = 0.0
+        for storage_source in storage_sources:
+            bucket_cost += (
+                storage_source["soft_delete_ratio"]
+                * storage_source["storage_class_ratio"]
+                * storage_source["relative_storage_class_cost"]
+            )
+        if bucket_cost > cost_threshold:
+            buckets[bucket_name] = round(bucket_cost, 4)
+
+    if list_buckets:
+        return " ".join(buckets.keys())  # Space-separated bucket names.
+    return json.dumps(buckets, indent=2)  # JSON output.
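+
+# Example (hypothetical values): a bucket holding 80% Standard data
+# (storage_class_ratio=0.8, relative cost 1.0) and 20% Nearline data
+# (storage_class_ratio=0.2, relative cost ~0.57), with a soft_delete_ratio of
+# 0.05 in both classes, scores 0.05 * 0.8 * 1.0 + 0.05 * 0.2 * 0.57 ~= 0.046,
+# so it is reported whenever cost_threshold is below ~4.6%.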
+ ) + parser.add_argument( + "--cost_threshold", + type=float, + default=0.0, + help="Relative Cost threshold.", + ) + parser.add_argument( + "--soft_delete_window", + type=int, + default=604800, + help="Time window (in seconds) for considering soft-deleted objects.", + ) + parser.add_argument( + "--agg_days", + type=int, + default=30, + help=( + "Time window (in days) for aggregating results over a time period," + " defaults to 30-day period" + ), + ) + parser.add_argument( + "--lookback_days", + type=int, + default=360, + help=( + "Time window (in days) for considering the how old the bucket to be." + ), + ) + parser.add_argument( + "--list", + type=bool, + default=False, + help="Return the list of bucketnames seperated by space.", + ) + + args = parser.parse_args() + + response = soft_delete_relative_cost_analyzer( + args.project_name, + args.cost_threshold, + args.soft_delete_window, + args.agg_days, + args.lookback_days, + args.list, + ) + if (not args.list): + print( + "To remove soft-delete policy from the listed buckets run:\n" + # Capture output + "python storage_soft_delete_relative_cost_analyzer.py [your-project-name] --[OTHER_OPTIONS] --list >" + " list_of_buckets.txt\n" + "cat list_of_buckets.txt | gcloud storage buckets update -I" + " --clear-soft-delete\n", + "\nThe buckets with approximate costs for soft delete:\n", + response, + ) + return + print(response) + + +if __name__ == "__main__": + soft_delete_relative_cost_analyzer_main() +# [END storage_soft_delete_relative_cost]