|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2024 Google LLC. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the 'License'); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +""" |
| 18 | +Identifies buckets with relative increase in cost on enabling the soft-delete. |
| 19 | +
|
| 20 | +The relative increase in cost of using soft delete is calculated by combining |
| 21 | +the storage/v2/deleted_bytes metric with the existing storage/v2/total_byte_seconds |
| 22 | +metric. |
| 23 | +
|
Relative cost of each bucket = 'soft delete retention duration'
                               × 'deleted bytes' / 'total bytes seconds'
                               × 'cost of storing in storage class'
                               × 'ratio of storage class'.
| 28 | +""" |
| 29 | + |
| 30 | +# [START storage_soft_delete_relative_cost] |
| 31 | + |
| 32 | +import argparse |
| 33 | +import json |
| 34 | +from typing import Dict, List |
| 35 | +import google.cloud.monitoring_v3 as monitoring_client |
| 36 | + |
| 37 | + |
def get_relative_cost(storage_class: str) -> float:
    """Retrieves the relative cost for a given storage class.

    Args:
        storage_class: The storage class (e.g., 'STANDARD', 'NEARLINE').

    Returns:
        The price per GB from https://cloud.google.com/storage/pricing,
        divided by the STANDARD storage class price, so STANDARD maps to 1.0.
        Unknown storage classes default to 1.0 (priced like STANDARD).
    """
    # Each entry is <class price per GB> / <STANDARD price per GB> (0.023).
    relative_cost = {
        "STANDARD": 0.023 / 0.023,
        "NEARLINE": 0.013 / 0.023,
        "COLDLINE": 0.007 / 0.023,
        "ARCHIVE": 0.0025 / 0.023,
    }

    return relative_cost.get(storage_class, 1.0)
| 56 | + |
| 57 | + |
def get_soft_delete_cost(
    project_name: str,
    soft_delete_window: int,
    agg_days: int,
    lookback_days: int
) -> Dict[str, List[Dict[str, float]]]:
    """Computes the relative soft-delete cost for every bucket in a project.

    Args:
        project_name: The name of the Google Cloud project.
        soft_delete_window: The time window in seconds for considering
            soft-deleted objects (default is 7 days).
        agg_days: Aggregate results over a time period, defaults to 30-day period
        lookback_days: Look back up to this many days, defaults to 360 days

    Returns:
        A dictionary keyed by bucket name whose values are per-storage-class
        cost breakdowns for that bucket.
    """
    client = monitoring_client.QueryServiceClient()

    # The per-class byte distribution must be known before costs can be
    # weighted, so fetch the ratios first.
    ratios = get_storage_class_ratio(
        project_name, client, agg_days, lookback_days
    )

    # Combine soft-deleted bytes from the Monitoring API with the ratios
    # to price each bucket.
    return calculate_soft_delete_costs(
        project_name,
        client,
        soft_delete_window,
        ratios,
        agg_days,
        lookback_days,
    )
| 96 | + |
| 97 | + |
def calculate_soft_delete_costs(
    project_name: str,
    query_client: monitoring_client.QueryServiceClient,
    soft_delete_window: int,
    storage_ratios_by_bucket: Dict[str, float],
    agg_days: int,
    lookback_days: int,
) -> Dict[str, List[Dict[str, float]]]:
    """Calculates the relative cost of enabling soft delete for each bucket in a
    project for certain time frame in secs.

    Args:
        project_name: The name of the Google Cloud project.
        query_client: A Monitoring API query client.
        soft_delete_window: The time window in seconds for considering
            soft-deleted objects (default is 7 days).
        storage_ratios_by_bucket: A dictionary of storage class ratios per
            bucket, keyed as "<bucket_name> - <storage_class>".
        agg_days: Aggregate results over a time period, defaults to 30-day period
        lookback_days: Look back up to this many days, defaults to 360 days

    Returns:
        A dictionary with bucket names as keys and a list of cost data
        dictionaries
        for each bucket, broken down by storage class.

    Raises:
        ValueError: If any (bucket, storage class) pair returned by the
            deleted-bytes query has no matching entry in
            storage_ratios_by_bucket.
    """
    # MQL query: ratio of deleted bytes to total byte-seconds, scaled by the
    # soft-delete retention window — this approximates the extra byte-seconds
    # soft delete would add per bucket/storage class/location.
    soft_deleted_bytes_time = query_client.query_time_series(
        monitoring_client.QueryTimeSeriesRequest(
            name=f"projects/{project_name}",
            query=f"""
                    {{
                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/deleted_bytes
                    | group_by [resource.bucket_name, metric.storage_class, resource.location], window(), .sum;
                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
                    | group_by [resource.bucket_name, metric.storage_class, resource.location], window(), .sum
                    }}
                    | ratio # Calculate ratios of deleted btyes to total bytes seconds
                    | value val(0) * {soft_delete_window}\'s\'
                    | every {agg_days}d
                    | within {lookback_days}d
                    """,
        )
    )
    buckets: Dict[str, List[Dict[str, float]]] = {}
    # Collects "<bucket> - <storage_class>" keys that have no known ratio,
    # so all misses can be reported together before failing.
    missing_distribution_storage_class = []
    for data_point in soft_deleted_bytes_time.time_series_data:
        # Label order matches the group_by clause in the query above.
        bucket_name = data_point.label_values[0].string_value
        storage_class = data_point.label_values[1].string_value
        # To include location-based cost analysis:
        # 1. Uncomment the line below:
        # location = data_point.label_values[2].string_value
        # 2. Update how you calculate 'relative_storage_class_cost' to factor in location
        soft_delete_ratio = data_point.point_data[0].values[0].double_value
        distribution_storage_class = bucket_name + " - " + storage_class
        storage_class_ratio = storage_ratios_by_bucket.get(
            distribution_storage_class)
        if storage_class_ratio is None:
            missing_distribution_storage_class.append(
                distribution_storage_class)
        buckets.setdefault(bucket_name, []).append({
            # Include storage class and location data for additional plotting dimensions.
            # "storage_class": storage_class,
            # 'location': location,
            "soft_delete_ratio": soft_delete_ratio,
            "storage_class_ratio": storage_class_ratio,
            "relative_storage_class_cost": get_relative_cost(storage_class)
        })

    # Fail loudly rather than return partial results containing None ratios.
    if missing_distribution_storage_class:
        print("Missing storage class for following buckets:",
              missing_distribution_storage_class)
        raise ValueError("Cannot proceed with missing storage class ratios.")

    return buckets
| 171 | + |
| 172 | + |
def get_storage_class_ratio(
    project_name: str,
    query_client: monitoring_client.QueryServiceClient,
    agg_days: int,
    lookback_days: int,
) -> Dict[str, float]:
    """Computes, per bucket, the fraction of bytes held by each storage class.

    These fractions weight the per-class relative prices when estimating a
    bucket's overall soft-delete cost.

    Args:
        project_name: The Google Cloud project name.
        query_client: Google Cloud's Monitoring Client's QueryServiceClient.
        agg_days: Aggregate results over a time period, defaults to 30-day period
        lookback_days: Look back up to this many days, defaults to 360 days

    Returns:
        A mapping from "<bucket_name> - <storage_class>" to that storage
        class's share of the bucket's total byte-seconds.
    """
    response = query_client.query_time_series(
        monitoring_client.QueryTimeSeriesRequest(
            name=f"projects/{project_name}",
            query=f"""
                    {{
                    # Fetch total byte-seconds for each bucket and storage class
                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
                    | group_by [resource.bucket_name, metric.storage_class], window(), .sum;
                    # Fetch total byte-seconds for each bucket (regardless of class)
                    fetch gcs_bucket :: storage.googleapis.com/storage/v2/total_byte_seconds
                    | group_by [resource.bucket_name], window(), .sum
                    }}
                    | ratio # Calculate ratios of storage class size to total size
                    | every {agg_days}d
                    | within {lookback_days}d
                    """,
        )
    )

    # Key each ratio by a descriptive "<bucket> - <class>" label so the cost
    # calculation can look it up directly.
    return {
        f"{series.label_values[0].string_value}"
        f" - {series.label_values[1].string_value}": (
            series.point_data[0].values[0].double_value
        )
        for series in response.time_series_data
    }
| 223 | + |
| 224 | + |
def soft_delete_relative_cost_analyzer(
    project_name: str,
    cost_threshold: float = 0.0,
    soft_delete_window: int = 604800,
    agg_days: int = 30,
    lookback_days: int = 360,
    list_buckets: bool = False,
) -> str:
    """Identifies buckets exceeding the relative cost threshold for enabling soft delete.

    Args:
        project_name: The Google Cloud project name.
        cost_threshold: Threshold above which to consider removing soft delete.
        soft_delete_window: Time window for calculating soft-delete costs (in
            seconds).
        agg_days: Aggregate results over this time period (in days).
        lookback_days: Look back up to this many days.
        list_buckets: Return a space-separated string of bucket names (True)
            or JSON (False, default).

    Returns:
        JSON-formatted results of buckets exceeding the threshold and their
        costs, *or* a space-separated string of bucket names. Both branches
        return str (json.dumps produces a string), so the return type is
        simply str.
    """

    buckets: Dict[str, float] = {}
    for bucket_name, storage_sources in get_soft_delete_cost(
        project_name, soft_delete_window, agg_days, lookback_days
    ).items():
        # Total relative cost is the sum over the bucket's storage classes.
        bucket_cost = sum(
            source["soft_delete_ratio"]
            * source["storage_class_ratio"]
            * source["relative_storage_class_cost"]
            for source in storage_sources
        )
        if bucket_cost > cost_threshold:
            buckets[bucket_name] = round(bucket_cost, 4)

    if list_buckets:
        return " ".join(buckets)  # Space-separated bucket names
    return json.dumps(buckets, indent=2)  # JSON output
| 268 | + |
| 269 | + |
def soft_delete_relative_cost_analyzer_main() -> None:
    """Parses CLI arguments and prints the soft-delete cost analysis.

    Sample run: python storage_soft_delete_relative_cost_analyzer.py <Project Name>
    """
    parser = argparse.ArgumentParser(
        description="Analyze and manage Google Cloud Storage soft-delete costs."
    )
    parser.add_argument(
        "project_name", help="The name of the Google Cloud project to analyze."
    )
    parser.add_argument(
        "--cost_threshold",
        type=float,
        default=0.0,
        help="Relative Cost threshold.",
    )
    parser.add_argument(
        "--soft_delete_window",
        type=int,
        default=604800,
        help="Time window (in seconds) for considering soft-deleted objects.",
    )
    parser.add_argument(
        "--agg_days",
        type=int,
        default=30,
        help=(
            "Time window (in days) for aggregating results over a time period,"
            " defaults to 30-day period"
        ),
    )
    parser.add_argument(
        "--lookback_days",
        type=int,
        default=360,
        help=(
            "Time window (in days) for considering the how old the bucket to be."
        ),
    )
    # BUG FIX: type=bool is a trap — argparse would call bool("False"), which
    # is True for any non-empty string. store_true gives real flag semantics.
    parser.add_argument(
        "--list",
        action="store_true",
        help="Return the list of bucket names separated by space.",
    )

    args = parser.parse_args()

    response = soft_delete_relative_cost_analyzer(
        args.project_name,
        args.cost_threshold,
        args.soft_delete_window,
        args.agg_days,
        args.lookback_days,
        args.list,
    )
    if not args.list:
        print(
            "To remove soft-delete policy from the listed buckets run:\n"
            # Capture output
            "python storage_soft_delete_relative_cost_analyzer.py [your-project-name] --[OTHER_OPTIONS] --list >"
            " list_of_buckets.txt\n"
            "cat list_of_buckets.txt | gcloud storage buckets update -I"
            " --clear-soft-delete\n",
            "\nThe buckets with approximate costs for soft delete:\n",
            response,
        )
        return
    print(response)
| 337 | + |
| 338 | + |
# Allow running this module directly as a command-line script.
if __name__ == "__main__":
    soft_delete_relative_cost_analyzer_main()
# [END storage_soft_delete_relative_cost]
0 commit comments