feat: add code samples for distillation (GoogleCloudPlatform#11156)

liulehui · web-flow · commit cee3e73b8ddc · 2024-02-12T15:03:55.000+01:00
* feat: add code samples for distillation

* fix test

* fix test
diff --git a/generative_ai/distillation.py b/generative_ai/distillation.py
@@ -0,0 +1,69 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START aiplatform_sdk_distillation]
+from __future__ import annotations
+
+
+from typing import Optional
+
+
+from google.auth import default
+import vertexai
+from vertexai.preview.language_models import TextGenerationModel, TuningEvaluationSpec
+
+
+# Resolve default Google credentials once, at import time, requesting the
+# cloud-platform scope. The second element of the returned tuple is discarded.
+# distill_model() passes these credentials to vertexai.init().
+credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+
+
+def distill_model(
+    project_id: str,
+    location: str,
+    dataset: str,
+    teacher_model: str,
+    train_steps: int = 300,
+    evaluation_dataset: Optional[str] = None,
+) -> None:
+    """Distill a new model.
+
+    Args:
+      project_id: GCP Project ID, used to initialize vertexai
+      location: GCP Region, used to initialize vertexai
+      dataset: GCS URI of jsonl file.
+      teacher_model: Name of the teacher model.
+      train_steps: Number of training steps to use when tuning the model.
+      evaluation_dataset: GCS URI of jsonl file of evaluation data.
+    """
+    vertexai.init(project=project_id, location=location, credentials=credentials)
+
+    eval_spec = TuningEvaluationSpec(evaluation_data=evaluation_dataset)
+
+    student_model = TextGenerationModel.from_pretrained("text-bison@002")
+    student_model.distill_from(
+        teacher_model=teacher_model,
+        dataset=dataset,
+        # Optional:
+        train_steps=train_steps,
+        tuning_job_location="europe-west4",
+        tuned_model_location=location,
+        tuning_evaluation_spec=eval_spec,
+    )
+
+    print(student_model._job.status)
+    return student_model
+
+
+if __name__ == "__main__":
+    distill_model()
+# [END aiplatform_sdk_distillation]
diff --git a/generative_ai/distillation_test.py b/generative_ai/distillation_test.py
@@ -0,0 +1,108 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Iterator
+import uuid
+
+from google.cloud import aiplatform
+from google.cloud import storage
+from google.cloud.aiplatform.compat.types import pipeline_state
+import pytest
+from vertexai.preview.language_models import TextGenerationModel
+
+import distillation
+
+# Test configuration taken from the environment.
+# NOTE(review): os.getenv yields None when GOOGLE_CLOUD_PROJECT is unset, while
+# the os.environ[...] lookup raises KeyError at import time when
+# CLOUD_STORAGE_BUCKET is missing -- confirm this asymmetry is intended.
+_PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+_LOCATION = "us-central1"
+_BUCKET = os.environ["CLOUD_STORAGE_BUCKET"]
+
+
+def get_model_display_name(tuned_model: TextGenerationModel) -> str:
+    language_model_tuning_job = tuned_model._job
+    pipeline_job = language_model_tuning_job._job
+    return dict(pipeline_job._gca_resource.runtime_config.parameter_values)[
+        "model_display_name"
+    ]
+
+
+def upload_to_gcs(bucket: str, name: str, data: str) -> None:
+    client = storage.Client()
+    bucket = client.get_bucket(bucket)
+    blob = bucket.blob(name)
+    blob.upload_from_string(data)
+
+
+def download_from_gcs(bucket: str, name: str) -> str:
+    client = storage.Client()
+    bucket = client.get_bucket(bucket)
+    blob = bucket.blob(name)
+    data = blob.download_as_bytes()
+    return "\n".join(data.decode().splitlines()[:10])
+
+
+def delete_from_gcs(bucket: str, name: str) -> None:
+    client = storage.Client()
+    bucket = client.get_bucket(bucket)
+    blob = bucket.blob(name)
+    blob.delete()
+
+
+@pytest.fixture(scope="function")
+def training_data_filename() -> str:
+    temp_filename = f"{uuid.uuid4()}.jsonl"
+    data = download_from_gcs(
+        "cloud-samples-data", "ai-platform/generative_ai/headline_classification.jsonl"
+    )
+    upload_to_gcs(_BUCKET, temp_filename, data)
+    try:
+        yield f"gs://{_BUCKET}/{temp_filename}"
+    finally:
+        delete_from_gcs(_BUCKET, temp_filename)
+
+
+def teardown_model(
+    tuned_model: TextGenerationModel, training_data_filename: str
+) -> None:
+    for tuned_model_name in tuned_model.list_tuned_model_names():
+        model_registry = aiplatform.models.ModelRegistry(model=tuned_model_name)
+        if (
+            training_data_filename
+            in model_registry.get_version_info("1").model_display_name
+        ):
+            display_name = model_registry.get_version_info("1").model_display_name
+            for endpoint in aiplatform.Endpoint.list():
+                for _ in endpoint.list_models():
+                    if endpoint.display_name == display_name:
+                        endpoint.undeploy_all()
+                        endpoint.delete()
+            aiplatform.Model(model_registry.model_resource_name).delete()
+
+
+def test_distill_model(training_data_filename: str) -> None:
+    """Takes approx. 60 minutes."""
+    student_model = distillation.distill_model(
+        dataset=training_data_filename,
+        teacher_model="text-unicorn@001",
+        project_id=_PROJECT_ID,
+        location=_LOCATION,
+        train_steps=1,
+        evaluation_dataset=training_data_filename,
+    )
+    try:
+        assert (
+            student_model._job.status
+            == pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
+        )
+    finally:
+        teardown_model(student_model, training_data_filename)