diff --git a/CHANGELOG.md b/CHANGELOG.md index 580adc3356..859bc2413b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## v2.245.0 (2025-05-28) + +### Features + + * Correct mypy type checking through PEP 561 + +### Bug Fixes and Other Changes + + * MLFLow update for dependabot + * addWaiterTimeoutHandling + * merge method inputs with class inputs + * update image_uri_configs 05-20-2025 07:18:17 PST + ## v2.244.2 (2025-05-19) ### Bug Fixes and Other Changes diff --git a/VERSION b/VERSION index 505bfa4996..aca3af02c1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.244.2 +2.245.1.dev0 diff --git a/requirements/extras/test_requirements.txt b/requirements/extras/test_requirements.txt index 8bdd7c8ae3..92273f2c9a 100644 --- a/requirements/extras/test_requirements.txt +++ b/requirements/extras/test_requirements.txt @@ -44,7 +44,7 @@ nbformat>=5.9,<6 accelerate>=0.24.1,<=0.27.0 schema==0.7.5 tensorflow>=2.16.2,<=2.18.0 -mlflow>=2.12.2,<2.13 +mlflow>=2.14.2,<3 huggingface_hub==0.26.2 uvicorn>=0.30.1 fastapi==0.115.4 diff --git a/setup.py b/setup.py index 3deaed54e0..f651c27898 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def get_optional_dependencies(): version=HERE.joinpath("VERSION").read_text().strip(), packages=find_packages("src"), package_dir={"": "src"}, - package_data={"": ["*.whl"]}, + package_data={"": ["*.whl", "py.typed"]}, py_modules=[os.path.splitext(os.path.basename(path))[0] for path in glob("src/*.py")], include_package_data=True, install_requires=get_dependencies(), diff --git a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json index d79e7637ed..9b7b18ee94 100644 --- a/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json +++ b/src/sagemaker/image_uri_config/huggingface-llm-neuronx.json @@ -4,7 +4,7 @@ "inf2" ], "version_aliases": { - "0.0": "0.0.27" + "0.0": "0.0.28" }, "versions": { "0.0.16": { @@ -589,6 +589,59 @@ "container_version": { "inf2": "ubuntu22.04" } + }, + "0.0.28": { + "py_versions": [ + "py310" + ], + "registries": { + "af-south-1": "626614931356", + "ap-east-1": "871362719292", + "ap-east-2": "975050140332", + "ap-northeast-1": "763104351884", + "ap-northeast-2": "763104351884", + "ap-northeast-3": "364406365360", + "ap-south-1": "763104351884", + "ap-south-2": "772153158452", + "ap-southeast-1": "763104351884", + "ap-southeast-2": "763104351884", + "ap-southeast-3": "907027046896", + "ap-southeast-4": "457447274322", + "ap-southeast-5": "550225433462", + "ap-southeast-7": "590183813437", + "ca-central-1": "763104351884", + "ca-west-1": "204538143572", + "cn-north-1": "727897471807", + "cn-northwest-1": "727897471807", + "eu-central-1": "763104351884", + "eu-central-2": "380420809688", + "eu-north-1": "763104351884", + "eu-south-1": "692866216735", + "eu-south-2": "503227376785", + "eu-west-1": "763104351884", + "eu-west-2": "763104351884", + "eu-west-3": "763104351884", + "il-central-1": "780543022126", + "me-central-1": "914824155844", + "me-south-1": "217643126080", + "mx-central-1": "637423239942", + "sa-east-1": "763104351884", + "us-east-1": "763104351884", + "us-east-2": "763104351884", + "us-gov-east-1": "446045086412", + "us-gov-west-1": "442386744353", + "us-iso-east-1": "886529160074", + "us-isob-east-1": "094389454867", + "us-isof-east-1": "303241398832", + "us-isof-south-1": "454834333376", + "us-west-1": "763104351884", + "us-west-2": "763104351884" + }, + "tag_prefix": "2.1.2-optimum0.0.28", + "repository": "huggingface-pytorch-tgi-inference", + "container_version": { + "inf2": "ubuntu22.04" + } } } } diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 58ae724074..2143da4e5c 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -580,7 +580,7 @@ def train( """Train a model using AWS SageMaker. Args: - input_data_config (Optional[Union[List[Channel], Dict[str, DataSourceType]]]): + input_data_config (Optional[List[Union[Channel, InputData]]]): The input data config for the training job. Takes a list of Channel objects or a dictionary of channel names to DataSourceType. DataSourceType can be an S3 URI string, local file path string, @@ -596,11 +596,23 @@ def train( current_training_job_name = _get_unique_name(self.base_job_name) input_data_key_prefix = f"{self.base_job_name}/{current_training_job_name}/input" - self.input_data_config = input_data_config or self.input_data_config or [] + final_input_data_config = self.input_data_config.copy() if self.input_data_config else [] + + if input_data_config: + # merge the inputs with method parameter taking precedence + existing_channels = {input.channel_name: input for input in final_input_data_config} + new_channels = [] + for new_input in input_data_config: + if new_input.channel_name in existing_channels: + existing_channels[new_input.channel_name] = new_input + else: + new_channels.append(new_input) + + final_input_data_config = list(existing_channels.values()) + new_channels - if self.input_data_config: - self.input_data_config = self._get_input_data_config( - self.input_data_config, input_data_key_prefix + if final_input_data_config: + final_input_data_config = self._get_input_data_config( + final_input_data_config, input_data_key_prefix ) if self.checkpoint_config and not self.checkpoint_config.s3_uri: @@ -643,7 +655,7 @@ def train( data_source=self.source_code.source_dir, key_prefix=input_data_key_prefix, ) - self.input_data_config.append(source_code_channel) + final_input_data_config.append(source_code_channel) self._prepare_train_script( tmp_dir=tmp_dir, @@ -664,7 +676,7 @@ def train( data_source=tmp_dir.name, key_prefix=input_data_key_prefix, ) - self.input_data_config.append(sm_drivers_channel) + final_input_data_config.append(sm_drivers_channel) # If source_code is provided, we will always use # the default container entrypoint and arguments @@ -691,7 +703,7 @@ def train( training_job_name=current_training_job_name, algorithm_specification=algorithm_specification, hyper_parameters=string_hyper_parameters, - input_data_config=self.input_data_config, + input_data_config=final_input_data_config, resource_config=resource_config, vpc_config=vpc_config, # Public Instance Attributes @@ -736,7 +748,7 @@ def train( sagemaker_session=self.sagemaker_session, container_entrypoint=algorithm_specification.container_entrypoint, container_arguments=algorithm_specification.container_arguments, - input_data_config=self.input_data_config, + input_data_config=final_input_data_config, hyper_parameters=string_hyper_parameters, environment=self.environment, ) diff --git a/src/sagemaker/predictor_async.py b/src/sagemaker/predictor_async.py index ef70b93599..783d034011 100644 --- a/src/sagemaker/predictor_async.py +++ b/src/sagemaker/predictor_async.py @@ -271,6 +271,7 @@ def _check_output_and_failure_paths(self, output_path, failure_path, waiter_conf output_file_found = threading.Event() failure_file_found = threading.Event() + waiter_error_catched = threading.Event() def check_output_file(): try: @@ -282,7 +283,7 @@ def check_output_file(): ) output_file_found.set() except WaiterError: - pass + waiter_error_catched.set() def check_failure_file(): try: @@ -294,7 +295,7 @@ def check_failure_file(): ) failure_file_found.set() except WaiterError: - pass + waiter_error_catched.set() output_thread = threading.Thread(target=check_output_file) failure_thread = threading.Thread(target=check_failure_file) @@ -302,7 +303,11 @@ def check_failure_file(): output_thread.start() failure_thread.start() - while not output_file_found.is_set() and not failure_file_found.is_set(): + while ( + not output_file_found.is_set() + and not failure_file_found.is_set() + and not waiter_error_catched.is_set() + ): time.sleep(1) if output_file_found.is_set(): @@ -310,17 +315,15 @@ def check_failure_file(): result = self.predictor._handle_response(response=s3_object) return result - failure_object = self.s3_client.get_object(Bucket=failure_bucket, Key=failure_key) - failure_response = self.predictor._handle_response(response=failure_object) + if failure_file_found.is_set(): + failure_object = self.s3_client.get_object(Bucket=failure_bucket, Key=failure_key) + failure_response = self.predictor._handle_response(response=failure_object) + raise AsyncInferenceModelError(message=failure_response) - raise ( - AsyncInferenceModelError(message=failure_response) - if failure_file_found.is_set() - else PollingTimeoutError( - message="Inference could still be running", - output_path=output_path, - seconds=waiter_config.delay * waiter_config.max_attempts, - ) + raise PollingTimeoutError( + message="Inference could still be running", + output_path=output_path, + seconds=waiter_config.delay * waiter_config.max_attempts, ) def update_endpoint( diff --git a/src/sagemaker/py.typed b/src/sagemaker/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/sagemaker/mlflow/test_forward_sagemaker_metrics.py b/tests/unit/sagemaker/mlflow/test_forward_sagemaker_metrics.py index 4b53c93ad4..14502880c3 100644 --- a/tests/unit/sagemaker/mlflow/test_forward_sagemaker_metrics.py +++ b/tests/unit/sagemaker/mlflow/test_forward_sagemaker_metrics.py @@ -48,7 +48,7 @@ def mock_mlflow_client(): def test_encode(): existing_names = set() assert encode("test-name", existing_names) == "test-name" - assert encode("test:name", existing_names) == "test_3a_name" + assert encode("test:name", existing_names) == "test:name" assert encode("test-name", existing_names) == "test-name_1" @@ -183,6 +183,7 @@ def getenv_side_effect(arg, default=None): spec=requests.Response ), "https://test.sagemaker.aws/api/2.0/mlflow/runs/create": Mock(spec=requests.Response), + "https://test.sagemaker.aws/api/2.0/mlflow/runs/update": Mock(spec=requests.Response), "https://test.sagemaker.aws/api/2.0/mlflow/runs/log-batch": [ Mock(spec=requests.Response), Mock(spec=requests.Response), @@ -211,6 +212,11 @@ def getenv_side_effect(arg, default=None): {"run_id": "test_run_id"} ) + mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/update"].status_code = 200 + mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/update"].text = json.dumps( + {"run_id": "test_run_id"} + ) + for mock_response in mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/log-batch"]: mock_response.status_code = 200 mock_response.text = json.dumps({}) @@ -221,6 +227,7 @@ def getenv_side_effect(arg, default=None): mock_request.side_effect = [ mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/experiments/get-by-name"], mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/create"], + mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/update"], *mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/log-batch"], mock_responses["https://test.sagemaker.aws/api/2.0/mlflow/runs/terminate"], ] @@ -231,7 +238,7 @@ def getenv_side_effect(arg, default=None): log_to_mlflow(metrics, params, tags) - assert mock_request.call_count == 6 # Total number of API calls + assert mock_request.call_count == 7 # Total number of API calls @patch("sagemaker.mlflow.forward_sagemaker_metrics.get_training_job_details") diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index b1348b5ac9..5d4722b8aa 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -1258,3 +1258,44 @@ def mock_upload_data(path, bucket, key_prefix): assert kwargs["tensor_board_output_config"].s3_output_path == default_base_path assert kwargs["tensor_board_output_config"].local_path == "/opt/ml/output/tensorboard" + + +@patch("sagemaker.modules.train.model_trainer.TrainingJob") +def test_input_merge(mock_training_job, modules_session): + model_input = InputData(channel_name="model", data_source="s3://bucket/model/model.tar.gz") + model_trainer = ModelTrainer( + training_image=DEFAULT_IMAGE, + role=DEFAULT_ROLE, + sagemaker_session=modules_session, + compute=DEFAULT_COMPUTE_CONFIG, + input_data_config=[model_input], + ) + + train_input = InputData(channel_name="train", data_source="s3://bucket/data/train") + model_trainer.train(input_data_config=[train_input]) + + mock_training_job.create.assert_called_once() + assert mock_training_job.create.call_args.kwargs["input_data_config"] == [ + Channel( + channel_name="model", + data_source=DataSource( + s3_data_source=S3DataSource( + s3_data_type="S3Prefix", + s3_uri="s3://bucket/model/model.tar.gz", + s3_data_distribution_type="FullyReplicated", + ) + ), + input_mode="File", + ), + Channel( + channel_name="train", + data_source=DataSource( + s3_data_source=S3DataSource( + s3_data_type="S3Prefix", + s3_uri="s3://bucket/data/train", + s3_data_distribution_type="FullyReplicated", + ) + ), + input_mode="File", + ), + ]