@@ -966,7 +966,7 @@ def _tune_save_checkpoint(self):
966
966
return
967
967
with tune .checkpoint_dir (step = self .state .global_step ) as checkpoint_dir :
968
968
output_dir = os .path .join (checkpoint_dir , f"{ PREFIX_CHECKPOINT_DIR } -{ self .state .global_step } " )
969
- self .save_model (output_dir )
969
+ self .save_model (output_dir , _internal_call = True )
970
970
if self .args .should_save :
971
971
self .state .save_to_json (os .path .join (output_dir , TRAINER_STATE_NAME ))
972
972
torch .save (self .optimizer .state_dict (), os .path .join (output_dir , OPTIMIZER_NAME ))
@@ -1634,7 +1634,7 @@ def _save_checkpoint(self, model, trial, metrics=None):
1634
1634
self .store_flos ()
1635
1635
1636
1636
output_dir = os .path .join (run_dir , checkpoint_folder )
1637
- self .save_model (output_dir )
1637
+ self .save_model (output_dir , _internal_call = True )
1638
1638
if self .deepspeed :
1639
1639
# under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
1640
1640
# config `stage3_gather_fp16_weights_on_model_save` is True
@@ -2002,7 +2002,7 @@ def is_world_process_zero(self) -> bool:
2002
2002
else :
2003
2003
return self .args .process_index == 0
2004
2004
2005
- def save_model (self , output_dir : Optional [str ] = None ):
2005
+ def save_model (self , output_dir : Optional [str ] = None , _internal_call : bool = False ):
2006
2006
"""
2007
2007
Will save the model, so you can reload it using `from_pretrained()`.
2008
2008
@@ -2051,6 +2051,10 @@ def save_model(self, output_dir: Optional[str] = None):
2051
2051
elif self .args .should_save :
2052
2052
self ._save (output_dir )
2053
2053
2054
+ # Push to the Hub when `save_model` is called by the user.
2055
+ if self .args .push_to_hub and not _internal_call :
2056
+ self .push_to_hub (commit_message = "Model save" )
2057
+
2054
2058
def _save_tpu (self , output_dir : Optional [str ] = None ):
2055
2059
output_dir = output_dir if output_dir is not None else self .args .output_dir
2056
2060
logger .info (f"Saving model checkpoint to { output_dir } " )
@@ -2768,9 +2772,10 @@ def push_to_hub(self, commit_message: Optional[str] = "End of training", blockin
2768
2772
model_name = Path (self .args .output_dir ).name
2769
2773
else :
2770
2774
model_name = self .args .hub_model_id .split ("/" )[- 1 ]
2775
+
2771
2776
# Needs to be executed on all processes for TPU training, but will only save on the process determined by
2772
2777
# self.args.should_save.
2773
- self .save_model ()
2778
+ self .save_model (_internal_call = True )
2774
2779
2775
2780
# Only push from one node.
2776
2781
if not self .is_world_process_zero ():
0 commit comments