@@ -78,8 +78,9 @@ const char* unused_tensors[] = {
78
78
" cond_stage_model.transformer.text_model.embeddings.position_ids" ,
79
79
" cond_stage_model.model.logit_scale" ,
80
80
" cond_stage_model.model.text_projection" ,
81
+ " conditioner.embedders.0.transformer.text_model.embeddings.position_ids" ,
81
82
" conditioner.embedders.0.model.logit_scale" ,
82
- " conditioner.embedders.0 .model.text_projection " ,
83
+ " conditioner.embedders.1 .model.logit_scale " ,
83
84
" model.diffusion_model.time_embedding.cond_proj.weight" ,
84
85
" unet.time_embedding.cond_proj.weight" ,
85
86
" model_ema.decay" ,
@@ -100,11 +101,11 @@ bool is_unused_tensor(std::string name) {
100
101
}
101
102
102
103
// Lookup table from OpenCLIP-style text-encoder tensor names to their HF CLIP
// ("transformer.text_model....") equivalents, for the tensors that live outside
// the per-layer resblocks (final layer norm, positional/token embeddings and,
// in the added entry, text_projection).
// convert_open_clip_to_hf_clip() consults this table with find()/operator[]
// after it has removed the leading model prefix from the name, and re-attaches
// the prefix on return - which is why the added ('+') entries are keyed without
// the "cond_stage_model." prefix that the removed ('-') entries carried.
// NOTE(review): this span is a diff hunk; '-' lines are the old entries and
// '+' lines are the new ones, and the bare numbers are diff gutter line numbers.
std::unordered_map<std::string, std::string> open_clip_to_hf_clip_model = {
103
- {" cond_stage_model. model.ln_final.bias" , " cond_stage_model. transformer.text_model.final_layer_norm.bias" },
104
- {" cond_stage_model. model.ln_final.weight" , " cond_stage_model. transformer.text_model.final_layer_norm.weight" },
105
- {" cond_stage_model. model.positional_embedding" , " cond_stage_model. transformer.text_model.embeddings.position_embedding.weight" },
106
- {" cond_stage_model. model.token_embedding.weight" , " cond_stage_model. transformer.text_model.embeddings.token_embedding.weight" },
107
-
104
+ {" model.ln_final.bias" , " transformer.text_model.final_layer_norm.bias" },
105
+ {" model.ln_final.weight" , " transformer.text_model.final_layer_norm.weight" },
106
+ {" model.positional_embedding" , " transformer.text_model.embeddings.position_embedding.weight" },
107
+ {" model.token_embedding.weight" , " transformer.text_model.embeddings.token_embedding.weight" },
108
+ { " model.text_projection " , " transformer.text_model.text_projection " },
108
109
};
109
110
110
111
std::unordered_map<std::string, std::string> open_clip_to_hk_clip_resblock = {
@@ -133,11 +134,21 @@ std::unordered_map<std::string, std::string> vae_decoder_name_map = {
133
134
134
135
std::string convert_open_clip_to_hf_clip (const std::string& name) {
135
136
std::string new_name = name;
137
+ std::string prefix;
136
138
if (starts_with (new_name, " conditioner.embedders.0." )) {
137
- new_name = " cond_stage_model." + new_name.substr (strlen (" conditioner.embedders.0." ));
139
+ prefix = " cond_stage_model." ;
140
+ new_name = new_name.substr (strlen (" conditioner.embedders.0." ));
141
+ } else if (starts_with (new_name, " conditioner.embedders.1." )) {
142
+ prefix = " cond_stage_model.1." ;
143
+ new_name = new_name.substr (strlen (" conditioner.embedders.0." ));
144
+ } else if (starts_with (new_name, " cond_stage_model." )) {
145
+ prefix = " cond_stage_model." ;
146
+ new_name = new_name.substr (strlen (" cond_stage_model." ));
147
+ } else {
148
+ return new_name;
138
149
}
139
- std::string open_clip_resblock_prefix = " cond_stage_model. model.transformer.resblocks." ;
140
- std::string hf_clip_resblock_prefix = " cond_stage_model. transformer.text_model.encoder.layers." ;
150
+ std::string open_clip_resblock_prefix = " model.transformer.resblocks." ;
151
+ std::string hf_clip_resblock_prefix = " transformer.text_model.encoder.layers." ;
141
152
142
153
if (open_clip_to_hf_clip_model.find (new_name) != open_clip_to_hf_clip_model.end ()) {
143
154
new_name = open_clip_to_hf_clip_model[new_name];
@@ -156,7 +167,7 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) {
156
167
}
157
168
}
158
169
159
- return new_name;
170
+ return prefix + new_name;
160
171
}
161
172
162
173
std::string convert_vae_decoder_name (const std::string& name) {
@@ -358,7 +369,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq)
358
369
359
370
std::string convert_tensor_name (const std::string& name) {
360
371
std::string new_name;
361
- if (starts_with (name, " cond_stage_model.model " ) || starts_with (name, " conditioner.embedders.0.model " )) {
372
+ if (starts_with (name, " cond_stage_model." ) || starts_with (name, " conditioner.embedders." )) {
362
373
new_name = convert_open_clip_to_hf_clip (name);
363
374
} else if (starts_with (name, " first_stage_model.decoder" )) {
364
375
new_name = convert_vae_decoder_name (name);
@@ -419,7 +430,7 @@ void preprocess_tensor(TensorStorage tensor_storage,
419
430
420
431
tensor_storage.name = new_name;
421
432
422
- if (starts_with ( new_name, " cond_stage_model. transformer.text_model.encoder.layers." ) &&
433
+ if (new_name. find ( " transformer.text_model.encoder.layers." ) != std::string::npos &&
423
434
ends_with (new_name, " attn.in_proj_weight" )) {
424
435
size_t prefix_size = new_name.find (" attn.in_proj_weight" );
425
436
std::string prefix = new_name.substr (0 , prefix_size);
@@ -431,7 +442,7 @@ void preprocess_tensor(TensorStorage tensor_storage,
431
442
432
443
processed_tensor_storages.insert (processed_tensor_storages.end (), chunks.begin (), chunks.end ());
433
444
434
- } else if (starts_with ( new_name, " cond_stage_model. transformer.text_model.encoder.layers." ) &&
445
+ } else if (new_name. find ( " transformer.text_model.encoder.layers." ) != std::string::npos &&
435
446
ends_with (new_name, " attn.in_proj_bias" )) {
436
447
size_t prefix_size = new_name.find (" attn.in_proj_bias" );
437
448
std::string prefix = new_name.substr (0 , prefix_size);
@@ -1163,15 +1174,20 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
1163
1174
}
1164
1175
1165
1176
SDVersion ModelLoader::get_sd_version () {
1177
+ // return VERSION_1_x;
1166
1178
TensorStorage token_embedding_weight;
1167
1179
for (auto & tensor_storage : tensor_storages) {
1180
+ if (tensor_storage.name .find (" conditioner.embedders.1" ) != std::string::npos) {
1181
+ return VERSION_XL;
1182
+ }
1168
1183
if (tensor_storage.name == " cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
1169
1184
tensor_storage.name == " cond_stage_model.model.token_embedding.weight" ||
1170
1185
tensor_storage.name == " text_model.embeddings.token_embedding.weight" ||
1171
1186
tensor_storage.name == " te.text_model.embeddings.token_embedding.weight" ||
1172
- tensor_storage.name == " conditioner.embedders.0.model.token_embedding.weight" ) {
1187
+ tensor_storage.name == " conditioner.embedders.0.model.token_embedding.weight" ||
1188
+ tensor_storage.name == " conditioner.embedders.0.transformer.text_model.embeddings.token_embedding.weight" ) {
1173
1189
token_embedding_weight = tensor_storage;
1174
- break ;
1190
+ // break;
1175
1191
}
1176
1192
}
1177
1193
if (token_embedding_weight.ne [0 ] == 768 ) {
@@ -1275,7 +1291,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
1275
1291
}
1276
1292
1277
1293
for (auto & tensor_storage : processed_tensor_storages) {
1278
- // LOG_DEBUG("%s", name.c_str());
1294
+ // LOG_DEBUG("%s", tensor_storage. name.c_str());
1279
1295
1280
1296
ggml_tensor* dst_tensor = NULL ;
1281
1297
0 commit comments