
Commit 43cbdb4

refactor(tx): compatible compvis format
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent a660547

6 files changed (+115, -41 lines)

conditioner.hpp

Lines changed: 70 additions & 23 deletions
@@ -43,6 +43,7 @@ struct Conditioner {
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+    bool cc_clip_l, cc_clip_g;
     SDVersion version = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
@@ -60,26 +61,38 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      int clip_skip = -1,
+                                      bool cc_clip_l = false,
+                                      bool cc_clip_g = false)
         : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
                 clip_skip = 2;
             }
         }
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
         if (version == VERSION_SD1) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (version == VERSION_SD2) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (version == VERSION_SDXL) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         } else if (version == VERSION_SDXL_REFINER) {
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
 
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
     void set_clip_skip(int clip_skip) {
         if (text_model) {
             text_model->set_clip_skip(clip_skip);
@@ -91,10 +104,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         if (text_model) {
-            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            text_model->get_param_tensors(tensors, clip_l_prefix());
         }
         if (text_model2) {
-            text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+            text_model2->get_param_tensors(tensors, clip_g_prefix());
         }
     }
 
@@ -603,19 +616,21 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 };
 
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+    bool cc_clip_l;
     CLIPVisionModelProjection vision_model;
 
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, bool cc_clip_l = false)
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+        this->cc_clip_l = cc_clip_l;
+        vision_model.init(params_ctx, tensor_types, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }
 
     std::string get_desc() {
         return "clip_vision";
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+        vision_model.get_param_tensors(tensors, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
@@ -642,6 +657,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };
 
 struct SD3CLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_clip_g, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -651,14 +667,32 @@ struct SD3CLIPEmbedder : public Conditioner {
 
     SD3CLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    int clip_skip = -1,
+                    bool cc_clip_l = false,
+                    bool cc_clip_g = false,
+                    bool cc_t5xxl = false)
         : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
            clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.2.transformer" : "text_encoders.t5xxl.transformer";
     }
 
     void set_clip_skip(int clip_skip) {
@@ -667,9 +701,9 @@ struct SD3CLIPEmbedder : public Conditioner {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        clip_g->get_param_tensors(tensors, clip_g_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }
 
     void alloc_params_buffer() {
@@ -988,28 +1022,41 @@ struct SD3CLIPEmbedder : public Conditioner {
 };
 
 struct FluxCLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
+                     int clip_skip = -1,
+                     bool cc_clip_l = false,
+                     bool cc_t5xxl = false) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.1.transformer" : "text_encoders.t5xxl.transformer";
     }
 
     void set_clip_skip(int clip_skip) {
         clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
    }
 
     void alloc_params_buffer() {

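The same pattern repeats across every embedder in this file: a cc_* flag chooses the legacy CompVis tensor prefix ("cond_stage_model.*") over the newer "text_encoders.*" layout, and each hard-coded prefix string is routed through a small *_prefix() accessor. A minimal standalone sketch of that pattern, with illustrative names that are not part of the commit:

#include <iostream>
#include <string>

// Each cc_* flag picks the legacy CompVis prefix over the newer layout.
struct PrefixDemo {
    bool cc_clip_l = false;
    bool cc_clip_g = false;

    std::string clip_l_prefix() const {
        return cc_clip_l ? "cond_stage_model.transformer.text_model"
                         : "text_encoders.clip_l.transformer.text_model";
    }

    std::string clip_g_prefix() const {
        return cc_clip_g ? "cond_stage_model.1.transformer.text_model"
                         : "text_encoders.clip_g.transformer.text_model";
    }
};

int main() {
    PrefixDemo compvis{true, true};  // legacy CompVis-style checkpoint
    PrefixDemo modern{};             // "text_encoders.*" checkpoint
    std::cout << compvis.clip_l_prefix() << "\n";  // cond_stage_model.transformer.text_model
    std::cout << modern.clip_g_prefix() << "\n";   // text_encoders.clip_g.transformer.text_model
}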
model.cpp

Lines changed: 9 additions & 0 deletions
@@ -1985,6 +1985,15 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     return mem_size;
 }
 
+bool ModelLoader::has_prefix_tensors(const std::string& prefix) {
+    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find(prefix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
     ModelLoader model_loader;

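Note that has_prefix_tensors() as written uses find() != npos, so it matches the pattern anywhere in a tensor name, not only at the start; the "cond_stage_model." and "text_encoders." patterns are distinctive enough that this works in practice. If strict starts-with semantics were ever wanted, a sketch of the anchored variant (hypothetical helper, not part of this commit):

#include <string>

// rfind(prefix, 0) only searches at position 0, turning the substring
// test into a starts-with test.
bool name_starts_with(const std::string& name, const std::string& prefix) {
    return name.rfind(prefix, 0) == 0;
}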
model.h

Lines changed: 1 addition & 0 deletions
@@ -196,6 +196,7 @@ class ModelLoader {
 
     static std::string load_merges();
     static std::string load_t5_tokenizer_json();
+    bool has_prefix_tensors(const std::string& prefix);
 };
 
 #endif  // __MODEL_H__

stable-diffusion.cpp

Lines changed: 15 additions & 10 deletions
@@ -374,19 +374,24 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }
 
+        auto cc_clip_l = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_l.");
+        auto cc_clip_g = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_g.");
+        auto cc_t5xxl = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.t5xxl.");
+        auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
+
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types, cc_clip_l);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);
 
             diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
-            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+            first_stage_model->get_param_tensors(tensors);
         } else {
             clip_backend = backend;
             bool use_t5xxl = false;
@@ -408,16 +413,16 @@ class StableDiffusionGGML {
                 if (diffusion_flash_attn) {
                     LOG_WARN("flash attention in this diffusion model is currently unsupported!");
                 }
-                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_clip_g, cc_t5xxl);
                 diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_t5xxl);
                 diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
             } else {
                 if (id_embeddings_path.find("v2") != std::string::npos) {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2, -1, cc_clip_l, cc_clip_g);
                 } else {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_1, -1, cc_clip_l, cc_clip_g);
                 }
                 diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             }
@@ -435,11 +440,11 @@ class StableDiffusionGGML {
                 } else {
                     vae_backend = backend;
                 }
-                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
+                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, false, version, cc_vae);
                 first_stage_model->alloc_params_buffer();
-                first_stage_model->get_param_tensors(tensors, "first_stage_model");
+                first_stage_model->get_param_tensors(tensors);
             } else {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only);
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, vae_decode_only, cc_vae);
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");

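The four cc_* booleans above carry the whole auto-detection: a checkpoint is treated as CompVis-format for a given component only when it contains "cond_stage_model." / "first_stage_model." tensors and lacks the corresponding "text_encoders.*" / "vae." names. A self-contained sketch of that decision, with ModelLoader stubbed as a plain list of tensor names for illustration:

#include <iostream>
#include <string>
#include <vector>

// Stub of ModelLoader::has_prefix_tensors(): substring search over names.
static bool has_prefix(const std::vector<std::string>& names, const std::string& p) {
    for (const auto& n : names)
        if (n.find(p) != std::string::npos)
            return true;
    return false;
}

int main() {
    // Tensor names typical of a CompVis-style (legacy) checkpoint.
    std::vector<std::string> names = {
        "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight",
        "first_stage_model.decoder.conv_in.weight",
    };
    bool cc_clip_l = has_prefix(names, "cond_stage_model.") && !has_prefix(names, "text_encoders.clip_l.");
    bool cc_vae = has_prefix(names, "first_stage_model.") && !has_prefix(names, "vae.");
    std::cout << "cc_clip_l=" << cc_clip_l << " cc_vae=" << cc_vae << "\n";  // prints: cc_clip_l=1 cc_vae=1
}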
tae.hpp

Lines changed: 9 additions & 3 deletions
@@ -184,17 +184,23 @@ class TAESD : public GGMLBlock {
 };
 
 struct TinyAutoEncoder : public GGMLRunner {
+    bool cc_vae;
     TAESD taesd;
     bool decode_only = false;
 
     TinyAutoEncoder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    const std::string prefix,
-                    bool decoder_only = true)
+                    bool decoder_only = true,
+                    bool cc_vae = false)
         : decode_only(decoder_only),
           taesd(decode_only),
          GGMLRunner(backend) {
-        taesd.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        taesd.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }
 
     std::string get_desc() {

vae.hpp

Lines changed: 11 additions & 5 deletions
@@ -521,25 +521,31 @@ class AutoencodingEngine : public GGMLBlock {
 };
 
 struct AutoEncoderKL : public GGMLRunner {
+    bool cc_vae;
     bool decode_only = true;
     AutoencodingEngine ae;
 
     AutoEncoderKL(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types,
-                  const std::string prefix,
                   bool decode_only = false,
                   bool use_video_decoder = false,
-                  SDVersion version = VERSION_SD1)
+                  SDVersion version = VERSION_SD1,
+                  bool cc_vae = false)
         : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) {
-        ae.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        ae.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }
 
     std::string get_desc() {
         return "vae";
     }
 
-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        ae.get_param_tensors(tensors, prefix);
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+        ae.get_param_tensors(tensors, vae_prefix());
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {

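In both tae.hpp and vae.hpp the prefix argument is gone from the constructor: vae_prefix() yields "first_stage_model" for CompVis checkpoints and the empty string otherwise, so tensor names resolve without a leading root. A sketch of how such a prefix might compose with a tensor name; the join rule here is an assumption for illustration, since the real init()/get_param_tensors() may handle separators differently:

#include <iostream>
#include <string>

// Assumed join rule: prepend "<prefix>." only when a prefix is set.
std::string join_prefix(const std::string& prefix, const std::string& name) {
    return prefix.empty() ? name : prefix + "." + name;
}

int main() {
    std::cout << join_prefix("first_stage_model", "decoder.conv_in.weight") << "\n";
    // -> first_stage_model.decoder.conv_in.weight (CompVis layout)
    std::cout << join_prefix("", "decoder.conv_in.weight") << "\n";
    // -> decoder.conv_in.weight (standalone VAE layout)
}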