
Commit 90f9934 (parent: cb48c3c)

refactor(tx): compatible compvis format

Signed-off-by: thxCode <thxcode0824@gmail.com>

6 files changed: +117 -43 lines
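
The commit makes the text-encoder and VAE loaders accept checkpoints in the original CompVis (single-file ldm) layout, where the encoders live under cond_stage_model.* and the VAE under first_stage_model.*, in addition to the split layout that uses text_encoders.clip_l.*, text_encoders.clip_g.*, text_encoders.t5xxl.*, and vae.*. Each component gets a cc_* ("compatible CompVis") flag that selects the tensor-name prefix. A minimal standalone sketch of the resulting mapping; the prefix strings are copied verbatim from the diffs below, while the main driver is only illustrative:

    #include <cstdio>
    #include <string>

    // Prefix selection introduced by this commit; strings are verbatim from
    // the diffs below. Note the encoder index differs per pipeline: in SD3
    // the T5 encoder is "cond_stage_model.2", while in Flux (which has no
    // CLIP-G) it is "cond_stage_model.1".
    std::string clip_l_prefix(bool cc) {
        return cc ? "cond_stage_model.transformer.text_model"
                  : "text_encoders.clip_l.transformer.text_model";
    }
    std::string clip_g_prefix(bool cc) {
        return cc ? "cond_stage_model.1.transformer.text_model"
                  : "text_encoders.clip_g.transformer.text_model";
    }
    std::string sd3_t5xxl_prefix(bool cc) {
        return cc ? "cond_stage_model.2.transformer"
                  : "text_encoders.t5xxl.transformer";
    }
    std::string vae_prefix(bool cc) {
        return cc ? "first_stage_model" : "";
    }

    int main() {
        const bool flags[] = {true, false};
        for (bool cc : flags) {
            std::printf("cc=%d\n  clip_l: %s\n  clip_g: %s\n  t5xxl:  %s\n  vae:    \"%s\"\n",
                        (int)cc,
                        clip_l_prefix(cc).c_str(),
                        clip_g_prefix(cc).c_str(),
                        sd3_t5xxl_prefix(cc).c_str(),
                        vae_prefix(cc).c_str());
        }
        return 0;
    }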

conditioner.hpp

Lines changed: 66 additions & 19 deletions
@@ -43,6 +43,7 @@ struct Conditioner {
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+    bool cc_clip_l, cc_clip_g;
     SDVersion version = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
@@ -60,26 +61,38 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      int clip_skip = -1,
+                                      bool cc_clip_l = false,
+                                      bool cc_clip_g = false)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (sd_version_is_sdxl(version)) {
             if (version != VERSION_SDXL_REFINER) {
-                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             }
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }

+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
     void set_clip_skip(int clip_skip) {
         if (text_model) {
             text_model->set_clip_skip(clip_skip);
@@ -91,10 +104,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         if (text_model) {
-            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            text_model->get_param_tensors(tensors, clip_l_prefix());
         }
         if (text_model2) {
-            text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+            text_model2->get_param_tensors(tensors, clip_g_prefix());
         }
     }

@@ -600,19 +613,21 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 };

 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+    bool cc_clip_l;
     CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, bool cc_clip_l = false)
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+        this->cc_clip_l = cc_clip_l;
+        vision_model.init(params_ctx, tensor_types, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     std::string get_desc() {
         return "clip_vision";
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+        vision_model.get_param_tensors(tensors, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
@@ -639,6 +654,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };

 struct SD3CLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_clip_g, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -648,14 +664,32 @@ struct SD3CLIPEmbedder : public Conditioner {

     SD3CLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    int clip_skip = -1,
+                    bool cc_clip_l = false,
+                    bool cc_clip_g = false,
+                    bool cc_t5xxl = false)
         : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.2.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
@@ -664,9 +698,9 @@ struct SD3CLIPEmbedder : public Conditioner {
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        clip_g->get_param_tensors(tensors, clip_g_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
@@ -985,28 +1019,41 @@ struct SD3CLIPEmbedder : public Conditioner {
 };

 struct FluxCLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;

     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
+                     int clip_skip = -1,
+                     bool cc_clip_l = false,
+                     bool cc_t5xxl = false) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.1.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
         clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
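
The pattern above is the same in every embedder: the constructor stores the cc_* flags, and both model initialization and tensor registration go through a small *_prefix() helper instead of a hard-coded string. A stripped-down sketch of that shape, with std::map<std::string, int> standing in for the real ggml_tensor* map and an invented ".token_embedding.weight" suffix for illustration:

    #include <cstdio>
    #include <map>
    #include <string>

    // Minimal shape of the refactor: the embedder derives its own tensor-name
    // prefix, so callers of get_param_tensors() no longer pass one in.
    struct MiniClipEmbedder {
        bool cc_clip_l = false;

        std::string clip_l_prefix() const {
            return cc_clip_l ? "cond_stage_model.transformer.text_model"
                             : "text_encoders.clip_l.transformer.text_model";
        }

        // int stands in for struct ggml_tensor* from the real code.
        void get_param_tensors(std::map<std::string, int>& tensors) const {
            // ".token_embedding.weight" is an illustrative suffix, not from the diff.
            tensors[clip_l_prefix() + ".token_embedding.weight"] = 0;
        }
    };

    int main() {
        std::map<std::string, int> tensors;
        MiniClipEmbedder compvis;
        compvis.cc_clip_l = true;
        MiniClipEmbedder split;
        compvis.get_param_tensors(tensors);
        split.get_param_tensors(tensors);
        for (const auto& kv : tensors) {
            std::printf("%s\n", kv.first.c_str());
        }
        return 0;
    }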

model.cpp

Lines changed: 15 additions & 6 deletions
@@ -1469,11 +1469,11 @@ SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
     bool input_block_checked = false;

-    bool has_multiple_encoders = false;
-    bool is_unet = false;
+    bool has_multiple_encoders = false;
+    bool is_unet = false;

-    bool is_xl = false;
-    bool is_flux = false;
+    bool is_xl = false;
+    bool is_flux = false;
     bool is_refiner = false;

 #define found_family (is_xl || is_flux)
@@ -1490,7 +1490,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
             is_unet = true;
-            if(has_multiple_encoders){
+            if (has_multiple_encoders) {
                 is_xl = true;
                 if (input_block_checked) {
                     break;
@@ -1499,7 +1499,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
             has_multiple_encoders = true;
-            if(is_unet){
+            if (is_unet) {
                 is_xl = true;
                 if (input_block_checked) {
                     break;
@@ -2037,6 +2037,15 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     return mem_size;
 }

+bool ModelLoader::has_prefix_tensors(const std::string& prefix) {
+    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find(prefix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
     ModelLoader model_loader;
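
Note that has_prefix_tensors() matches with std::string::find, so it reports a hit when the argument occurs anywhere in a stored tensor name, not only as a leading prefix. A self-contained sketch of the same matching rule over plain strings (the tensor names are invented for illustration):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same matching rule as ModelLoader::has_prefix_tensors in this commit:
    // substring search, not an anchored prefix check.
    static bool has_prefix_tensors(const std::vector<std::string>& names, const std::string& prefix) {
        for (const auto& name : names) {
            if (name.find(prefix) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        // Hypothetical tensor names, one per checkpoint layout.
        std::vector<std::string> compvis = {"cond_stage_model.transformer.text_model.final_layer_norm.weight"};
        std::vector<std::string> split   = {"text_encoders.clip_l.transformer.text_model.final_layer_norm.weight"};

        std::printf("compvis has cond_stage_model.: %d\n", (int)has_prefix_tensors(compvis, "cond_stage_model."));
        std::printf("split has cond_stage_model.:   %d\n", (int)has_prefix_tensors(split, "cond_stage_model."));
        return 0;
    }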
20422051

model.h

Lines changed: 1 addition & 0 deletions
@@ -228,6 +228,7 @@ class ModelLoader {

     static std::string load_merges();
     static std::string load_t5_tokenizer_json();
+    bool has_prefix_tensors(const std::string& prefix);
 };

 #endif  // __MODEL_H__

stable-diffusion.cpp

Lines changed: 15 additions & 10 deletions
@@ -431,19 +431,24 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }

+        auto cc_clip_l = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_l.");
+        auto cc_clip_g = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_g.");
+        auto cc_t5xxl = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.t5xxl.");
+        auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
+
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types, cc_clip_l);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);

             diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);

-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
-            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+            first_stage_model->get_param_tensors(tensors);
         } else {
             clip_backend = backend;
             if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -457,16 +462,16 @@ class StableDiffusionGGML {
                 if (diffusion_flash_attn) {
                     LOG_WARN("flash attention in this diffusion model is currently unsupported!");
                 }
-                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_clip_g, cc_t5xxl);
                 diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_t5xxl);
                 diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             } else {
                 if (id_embeddings_path.find("v2") != std::string::npos) {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2, -1, cc_clip_l, cc_clip_g);
                 } else {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_1, -1, cc_clip_l, cc_clip_g);
                 }
                 diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             }
@@ -484,12 +489,12 @@ class StableDiffusionGGML {
             } else {
                 vae_backend = backend;
             }
-            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, false, version, cc_vae);
             first_stage_model->alloc_params_buffer();
-            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+            first_stage_model->get_param_tensors(tensors);
         }
         if (use_tiny_autoencoder) {
-            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
+            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
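
Each cc_* flag therefore flips to the CompVis naming only when the checkpoint contains cond_stage_model. (or first_stage_model. for the VAE) tensors and lacks the corresponding split-layout prefix; a checkpoint that carries both keeps the split names. A standalone sketch of that derivation, with invented tensor names:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same substring matching rule as ModelLoader::has_prefix_tensors.
    static bool has_prefix_tensors(const std::vector<std::string>& names, const std::string& p) {
        for (const auto& n : names) {
            if (n.find(p) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        // Hypothetical SDXL checkpoint in the original CompVis layout.
        std::vector<std::string> names = {
            "cond_stage_model.transformer.text_model.final_layer_norm.weight",
            "cond_stage_model.1.transformer.text_model.final_layer_norm.weight",
            "first_stage_model.decoder.conv_in.weight",
        };

        // Flag derivation copied from StableDiffusionGGML after this commit.
        bool cc_clip_l = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.clip_l.");
        bool cc_clip_g = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.clip_g.");
        bool cc_t5xxl  = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.t5xxl.");
        bool cc_vae    = has_prefix_tensors(names, "first_stage_model.") && !has_prefix_tensors(names, "vae.");

        // Prints: cc_clip_l=1 cc_clip_g=1 cc_t5xxl=1 cc_vae=1
        std::printf("cc_clip_l=%d cc_clip_g=%d cc_t5xxl=%d cc_vae=%d\n",
                    (int)cc_clip_l, (int)cc_clip_g, (int)cc_t5xxl, (int)cc_vae);
        return 0;
    }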

tae.hpp

Lines changed: 9 additions & 3 deletions
@@ -192,18 +192,24 @@ class TAESD : public GGMLBlock {
 };

 struct TinyAutoEncoder : public GGMLRunner {
+    bool cc_vae;
     TAESD taesd;
     bool decode_only = false;

     TinyAutoEncoder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    const std::string prefix,
                     bool decoder_only = true,
-                    SDVersion version = VERSION_SD1)
+                    SDVersion version = VERSION_SD1,
+                    bool cc_vae = false)
         : decode_only(decoder_only),
           taesd(decode_only, version),
           GGMLRunner(backend) {
-        taesd.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        taesd.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }

     std::string get_desc() {

vae.hpp

Lines changed: 11 additions & 5 deletions
@@ -521,25 +521,31 @@ class AutoencodingEngine : public GGMLBlock {
 };

 struct AutoEncoderKL : public GGMLRunner {
+    bool cc_vae;
     bool decode_only = true;
     AutoencodingEngine ae;

     AutoEncoderKL(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types,
-                  const std::string prefix,
                   bool decode_only = false,
                   bool use_video_decoder = false,
-                  SDVersion version = VERSION_SD1)
+                  SDVersion version = VERSION_SD1,
+                  bool cc_vae = false)
         : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) {
-        ae.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        ae.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }

     std::string get_desc() {
         return "vae";
     }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        ae.get_param_tensors(tensors, prefix);
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+        ae.get_param_tensors(tensors, vae_prefix());
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
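
AutoEncoderKL and TinyAutoEncoder (in tae.hpp above) share the same rule: vae_prefix() is "first_stage_model" for CompVis checkpoints and empty otherwise, and get_param_tensors() no longer takes a caller-supplied prefix. A small sketch of the resulting tensor names; the "decoder.conv_in.weight" suffix and the dot-joining rule are illustrative assumptions, not taken from this diff:

    #include <cstdio>
    #include <string>

    // Prefix rule shared by AutoEncoderKL and TinyAutoEncoder after this commit.
    std::string vae_prefix(bool cc_vae) {
        return cc_vae ? "first_stage_model" : "";
    }

    int main() {
        const bool flags[] = {true, false};
        for (bool cc : flags) {
            std::string prefix = vae_prefix(cc);
            // Assumed joining rule: a non-empty prefix is separated by a dot;
            // "decoder.conv_in.weight" is an illustrative suffix.
            std::string name = prefix.empty() ? "decoder.conv_in.weight"
                                              : prefix + ".decoder.conv_in.weight";
            std::printf("cc_vae=%d -> %s\n", (int)cc, name.c_str());
        }
        return 0;
    }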
