@@ -43,6 +43,7 @@ struct Conditioner {
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+    bool cc_clip_l, cc_clip_g;
     SDVersion version    = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
@@ -60,26 +61,38 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv      = PM_VERSION_1,
-                                      int clip_skip     = -1)
+                                      int clip_skip     = -1,
+                                      bool cc_clip_l    = false,
+                                      bool cc_clip_g    = false)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (sd_version_is_sdxl(version)) {
             if (version != VERSION_SDXL_REFINER) {
-                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             }
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
 
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
     void set_clip_skip(int clip_skip) {
         if (text_model) {
             text_model->set_clip_skip(clip_skip);
@@ -91,10 +104,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         if (text_model) {
-            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            text_model->get_param_tensors(tensors, clip_l_prefix());
         }
         if (text_model2) {
-            text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+            text_model2->get_param_tensors(tensors, clip_g_prefix());
         }
     }
 
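The new cc_clip_l / cc_clip_g flags only choose between two tensor-name layouts: the legacy ldm-style "cond_stage_model.*" prefixes and the "text_encoders.*" prefixes. A minimal usage sketch (the call site is hypothetical and not part of this diff; backend, tensor_types, and embd_dir are assumed to come from the surrounding loader code):

// Hypothetical call site: legacy checkpoint naming for both CLIP encoders
// of an SDXL model.
auto cond = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(
    backend, tensor_types, embd_dir, VERSION_SDXL, PM_VERSION_1,
    /*clip_skip=*/-1, /*cc_clip_l=*/true, /*cc_clip_g=*/true);

std::map<std::string, struct ggml_tensor*> tensors;
cond->get_param_tensors(tensors);
// Keys are registered under "cond_stage_model.transformer.text_model.*" and
// "cond_stage_model.1.transformer.text_model.*"; with the flags left false,
// they would fall under "text_encoders.clip_l.*" / "text_encoders.clip_g.*".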
@@ -600,19 +613,21 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 };
 
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+    bool cc_clip_l;
     CLIPVisionModelProjection vision_model;
 
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, bool cc_clip_l = false)
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+        this->cc_clip_l = cc_clip_l;
+        vision_model.init(params_ctx, tensor_types, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }
 
     std::string get_desc() {
        return "clip_vision";
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+        vision_model.get_param_tensors(tensors, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
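The vision embedder gets the same treatment; here the flag also steers vision_model.init, so parameter creation and get_param_tensors stay consistent. A short sketch (hypothetical call site; the loader glue and the detection of the layout are assumptions, not part of the patch):

// Hypothetical: the layout flag would be detected elsewhere in the loader.
bool legacy_names = true;  // assumption for illustration
auto clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, tensor_types, legacy_names);

std::map<std::string, struct ggml_tensor*> tensors;
clip_vision->get_param_tensors(tensors);
// cc_clip_l == true  -> keys under "cond_stage_model.transformer.*"
// cc_clip_l == false -> keys under "text_encoders.clip_l.transformer.*"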
@@ -639,6 +654,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };
 
 struct SD3CLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_clip_g, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -648,14 +664,32 @@ struct SD3CLIPEmbedder : public Conditioner {
 
     SD3CLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    int clip_skip  = -1,
+                    bool cc_clip_l = false,
+                    bool cc_clip_g = false,
+                    bool cc_t5xxl  = false)
         : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
+        this->cc_t5xxl  = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.2.transformer" : "text_encoders.t5xxl.transformer";
     }
 
     void set_clip_skip(int clip_skip) {
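With three independent flags, an SD3 checkpoint can mix layouts per encoder. A worked example (the construction is hypothetical; the resolved prefixes follow directly from the ternaries above):

// Hypothetical: legacy CLIP names, text_encoders-style T5 names.
auto sd3_cond = std::make_shared<SD3CLIPEmbedder>(
    backend, tensor_types, /*clip_skip=*/-1,
    /*cc_clip_l=*/true, /*cc_clip_g=*/true, /*cc_t5xxl=*/false);
// sd3_cond->clip_l_prefix() == "cond_stage_model.transformer.text_model"
// sd3_cond->clip_g_prefix() == "cond_stage_model.1.transformer.text_model"
// sd3_cond->t5xxl_prefix()  == "text_encoders.t5xxl.transformer"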
@@ -664,9 +698,9 @@ struct SD3CLIPEmbedder : public Conditioner {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        clip_g->get_param_tensors(tensors, clip_g_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }
 
     void alloc_params_buffer() {
@@ -985,28 +1019,41 @@ struct SD3CLIPEmbedder : public Conditioner {
 };
 
 struct FluxCLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
+                     int clip_skip  = -1,
+                     bool cc_clip_l = false,
+                     bool cc_t5xxl  = false) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_t5xxl  = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5     = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.1.transformer" : "text_encoders.t5xxl.transformer";
     }
 
     void set_clip_skip(int clip_skip) {
         clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }
 
     void alloc_params_buffer() {
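FluxCLIPEmbedder follows the same pattern with two flags; note that in the legacy layout T5 sits at index 1 ("cond_stage_model.1.*") rather than index 2, since Flux has no CLIP-G. A final sketch (hypothetical call site, loader glue assumed):

// Hypothetical: both Flux encoders use the legacy layout.
auto flux_cond = std::make_shared<FluxCLIPEmbedder>(
    backend, tensor_types, /*clip_skip=*/-1,
    /*cc_clip_l=*/true, /*cc_t5xxl=*/true);
// flux_cond->clip_l_prefix() == "cond_stage_model.transformer.text_model"
// flux_cond->t5xxl_prefix()  == "cond_stage_model.1.transformer"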