@@ -43,6 +43,7 @@ struct Conditioner {
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+    bool cc_clip_l, cc_clip_g;
     SDVersion version = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
@@ -60,26 +61,38 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      int clip_skip = -1,
+                                      bool cc_clip_l = false,
+                                      bool cc_clip_g = false)
         : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
                 clip_skip = 2;
             }
         }
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
         if (version == VERSION_SD1) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (version == VERSION_SD2) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (version == VERSION_SDXL) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         } else if (version == VERSION_SDXL_REFINER) {
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }

+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
     void set_clip_skip(int clip_skip) {
         if (text_model) {
             text_model->set_clip_skip(clip_skip);
@@ -91,10 +104,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         if (text_model) {
-            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            text_model->get_param_tensors(tensors, clip_l_prefix());
         }
         if (text_model2) {
-            text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+            text_model2->get_param_tensors(tensors, clip_g_prefix());
         }
     }

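The helpers added above are the heart of this change: each `cc_*` flag switches the expected tensor names from the split text-encoder layout (`text_encoders.clip_l.*`, now the default) back to the legacy all-in-one checkpoint layout (`cond_stage_model.*`). Note that this flips the previous behavior for SD1/SD2/SDXL, which always used the `cond_stage_model.*` names; callers loading legacy checkpoints must now pass `cc_clip_l`/`cc_clip_g` as `true`. A minimal self-contained sketch of the selection logic follows; the `PrefixSelector` struct is invented purely for illustration, since the real helpers live inside the conditioner classes:

```cpp
#include <iostream>
#include <string>

// Hypothetical, distilled illustration of the cc_* pattern in this diff:
// each flag selects the legacy ldm-style prefix ("cond_stage_model.*")
// over the split text-encoder prefix ("text_encoders.*", the new default).
struct PrefixSelector {
    bool cc_clip_l = false;
    bool cc_clip_g = false;

    std::string clip_l_prefix() const {
        return cc_clip_l ? "cond_stage_model.transformer.text_model"
                         : "text_encoders.clip_l.transformer.text_model";
    }

    std::string clip_g_prefix() const {
        return cc_clip_g ? "cond_stage_model.1.transformer.text_model"
                         : "text_encoders.clip_g.transformer.text_model";
    }
};

int main() {
    PrefixSelector legacy{true, true};  // ldm-style single checkpoint
    PrefixSelector split;               // split text encoders (default)
    std::cout << legacy.clip_l_prefix() << "\n"   // cond_stage_model.transformer.text_model
              << split.clip_g_prefix() << "\n";   // text_encoders.clip_g.transformer.text_model
}
```

The same pattern repeats, with a `t5xxl` variant, in the SD3 and Flux conditioners below.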
@@ -603,19 +616,21 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 };

 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+    bool cc_clip_l;
     CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, bool cc_clip_l = false)
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+        this->cc_clip_l = cc_clip_l;
+        vision_model.init(params_ctx, tensor_types, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     std::string get_desc() {
         return "clip_vision";
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+        vision_model.get_param_tensors(tensors, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
@@ -642,6 +657,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };

 struct SD3CLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_clip_g, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -651,14 +667,32 @@ struct SD3CLIPEmbedder : public Conditioner {

     SD3CLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    int clip_skip = -1,
+                    bool cc_clip_l = false,
+                    bool cc_clip_g = false,
+                    bool cc_t5xxl = false)
         : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.2.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
@@ -667,9 +701,9 @@ struct SD3CLIPEmbedder : public Conditioner {
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        clip_g->get_param_tensors(tensors, clip_g_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
@@ -988,28 +1022,41 @@ struct SD3CLIPEmbedder : public Conditioner {
 };

 struct FluxCLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;

     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
+                     int clip_skip = -1,
+                     bool cc_clip_l = false,
+                     bool cc_t5xxl = false) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.1.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
         clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
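Nothing in this diff sets the `cc_*` flags automatically; the caller decides. One plausible caller-side approach, sketched here as a hypothetical helper that is not part of this patch, is to probe which naming scheme the checkpoint's tensor map actually contains and feed the answer into the constructors above:

```cpp
#include <map>
#include <string>

// Hypothetical helper (not in this patch): guess a cc_* flag by checking
// whether the legacy tensor-name prefix occurs in the checkpoint's tensor
// map. The map mirrors the std::map<std::string, enum ggml_type> that the
// conditioner constructors already receive.
template <typename V>
static bool has_prefix(const std::map<std::string, V>& tensor_types,
                       const std::string& prefix) {
    // lower_bound returns the first key >= prefix; the map contains a key
    // with this prefix iff that key itself starts with the prefix.
    auto it = tensor_types.lower_bound(prefix);
    return it != tensor_types.end() &&
           it->first.compare(0, prefix.size(), prefix) == 0;
}

template <typename V>
static bool guess_cc_clip_l(const std::map<std::string, V>& tensor_types) {
    return has_prefix(tensor_types, "cond_stage_model.transformer.text_model");
}
```

For `SD3CLIPEmbedder` and `FluxCLIPEmbedder` the `false` defaults reproduce the prefixes the old code hard-coded, so their behavior is unchanged unless a flag is set explicitly.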