@@ -51,7 +51,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
51
51
52
52
std::string trigger_word = " img";  // should be user settable
// Set from the constructor argument of the same name.
std::string embd_dir;
// Running count of custom tokens added from embedding tensors whose first
// dimension equals text_model's hidden size (one per row of each such tensor).
int32_t num_custom_embeddings = 0;
// Same counter for embedding tensors whose first dimension equals
// text_model2's hidden size.
int32_t num_custom_embeddings_2 = 0;
// Raw bytes of every loaded custom embedding tensor, appended in load order.
std::vector<uint8_t> token_embed_custom;
// Names of the embeddings that have already been passed through the loader.
std::vector<std::string> readed_embeddings;
57
58
@@ -61,54 +62,54 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
61
62
SDVersion version = VERSION_SD1,
62
63
PMVersion pv = PM_VERSION_1,
63
64
int clip_skip = -1 )
64
- : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407 ), embd_dir(embd_dir) {
65
+ : version(version), pm_version(pv), tokenizer(sd_version_is_sd2( version) ? 0 : 49407 ), embd_dir(embd_dir) {
65
66
if (clip_skip <= 0 ) {
66
67
clip_skip = 1 ;
67
- if (version == VERSION_SD2 || version == VERSION_SDXL ) {
68
+ if (sd_version_is_sd2 ( version) || sd_version_is_sdxl ( version) ) {
68
69
clip_skip = 2 ;
69
70
}
70
71
}
71
- if (version == VERSION_SD1 ) {
72
+ if (sd_version_is_sd1 ( version) ) {
72
73
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip);
73
- } else if (version == VERSION_SD2 ) {
74
+ } else if (sd_version_is_sd2 ( version) ) {
74
75
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14, clip_skip);
75
- } else if (version == VERSION_SDXL ) {
76
+ } else if (sd_version_is_sdxl ( version) ) {
76
77
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, clip_skip, false );
77
78
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, clip_skip, false );
78
79
}
79
80
}
80
81
81
82
void set_clip_skip (int clip_skip) {
82
83
text_model->set_clip_skip (clip_skip);
83
- if (version == VERSION_SDXL ) {
84
+ if (sd_version_is_sdxl ( version) ) {
84
85
text_model2->set_clip_skip (clip_skip);
85
86
}
86
87
}
87
88
88
89
void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
89
90
text_model->get_param_tensors (tensors, " cond_stage_model.transformer.text_model" );
90
- if (version == VERSION_SDXL ) {
91
+ if (sd_version_is_sdxl ( version) ) {
91
92
text_model2->get_param_tensors (tensors, " cond_stage_model.1.transformer.text_model" );
92
93
}
93
94
}
94
95
95
96
void alloc_params_buffer () {
96
97
text_model->alloc_params_buffer ();
97
- if (version == VERSION_SDXL ) {
98
+ if (sd_version_is_sdxl ( version) ) {
98
99
text_model2->alloc_params_buffer ();
99
100
}
100
101
}
101
102
102
103
void free_params_buffer () {
103
104
text_model->free_params_buffer ();
104
- if (version == VERSION_SDXL ) {
105
+ if (sd_version_is_sdxl ( version) ) {
105
106
text_model2->free_params_buffer ();
106
107
}
107
108
}
108
109
109
110
size_t get_params_buffer_size () {
110
111
size_t buffer_size = text_model->get_params_buffer_size ();
111
- if (version == VERSION_SDXL ) {
112
+ if (sd_version_is_sdxl ( version) ) {
112
113
buffer_size += text_model2->get_params_buffer_size ();
113
114
}
114
115
return buffer_size;
@@ -131,28 +132,55 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
131
132
params.no_alloc = false ;
132
133
struct ggml_context * embd_ctx = ggml_init (params);
133
134
struct ggml_tensor * embd = NULL ;
134
- int64_t hidden_size = text_model-> model . hidden_size ;
135
+ struct ggml_tensor * embd2 = NULL ;
135
136
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
136
- if (tensor_storage.ne [0 ] != hidden_size) {
137
- LOG_DEBUG (" embedding wrong hidden size, got %i, expected %i" , tensor_storage.ne [0 ], hidden_size);
138
- return false ;
137
+ if (tensor_storage.ne [0 ] != text_model->model .hidden_size ) {
138
+ if (text_model2) {
139
+ if (tensor_storage.ne [0 ] == text_model2->model .hidden_size ) {
140
+ embd2 = ggml_new_tensor_2d (embd_ctx, tensor_storage.type , text_model2->model .hidden_size , tensor_storage.n_dims > 1 ? tensor_storage.ne [1 ] : 1 );
141
+ *dst_tensor = embd2;
142
+ } else {
143
+ LOG_DEBUG (" embedding wrong hidden size, got %i, expected %i or %i" , tensor_storage.ne [0 ], text_model->model .hidden_size , text_model2->model .hidden_size );
144
+ return false ;
145
+ }
146
+ } else {
147
+ LOG_DEBUG (" embedding wrong hidden size, got %i, expected %i" , tensor_storage.ne [0 ], text_model->model .hidden_size );
148
+ return false ;
149
+ }
150
+ } else {
151
+ embd = ggml_new_tensor_2d (embd_ctx, tensor_storage.type , text_model->model .hidden_size , tensor_storage.n_dims > 1 ? tensor_storage.ne [1 ] : 1 );
152
+ *dst_tensor = embd;
139
153
}
140
- embd = ggml_new_tensor_2d (embd_ctx, tensor_storage.type , hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne [1 ] : 1 );
141
- *dst_tensor = embd;
142
154
return true ;
143
155
};
144
156
model_loader.load_tensors (on_load, NULL );
145
157
readed_embeddings.push_back (embd_name);
146
- token_embed_custom.resize (token_embed_custom.size () + ggml_nbytes (embd));
147
- memcpy ((void *)(token_embed_custom.data () + num_custom_embeddings * hidden_size * ggml_type_size (embd->type )),
148
- embd->data ,
149
- ggml_nbytes (embd));
150
- for (int i = 0 ; i < embd->ne [1 ]; i++) {
151
- bpe_tokens.push_back (text_model->model .vocab_size + num_custom_embeddings);
152
- // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
153
- num_custom_embeddings++;
158
+ if (embd) {
159
+ int64_t hidden_size = text_model->model .hidden_size ;
160
+ token_embed_custom.resize (token_embed_custom.size () + ggml_nbytes (embd));
161
+ memcpy ((void *)(token_embed_custom.data () + num_custom_embeddings * hidden_size * ggml_type_size (embd->type )),
162
+ embd->data ,
163
+ ggml_nbytes (embd));
164
+ for (int i = 0 ; i < embd->ne [1 ]; i++) {
165
+ bpe_tokens.push_back (text_model->model .vocab_size + num_custom_embeddings);
166
+ // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
167
+ num_custom_embeddings++;
168
+ }
169
+ LOG_DEBUG (" embedding '%s' applied, custom embeddings: %i" , embd_name.c_str (), num_custom_embeddings);
170
+ }
171
+ if (embd2) {
172
+ int64_t hidden_size = text_model2->model .hidden_size ;
173
+ token_embed_custom.resize (token_embed_custom.size () + ggml_nbytes (embd2));
174
+ memcpy ((void *)(token_embed_custom.data () + num_custom_embeddings_2 * hidden_size * ggml_type_size (embd2->type )),
175
+ embd2->data ,
176
+ ggml_nbytes (embd2));
177
+ for (int i = 0 ; i < embd2->ne [1 ]; i++) {
178
+ bpe_tokens.push_back (text_model2->model .vocab_size + num_custom_embeddings_2);
179
+ // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
180
+ num_custom_embeddings_2++;
181
+ }
182
+ LOG_DEBUG (" embedding '%s' applied, custom embeddings: %i (text model 2)" , embd_name.c_str (), num_custom_embeddings_2);
154
183
}
155
- LOG_DEBUG (" embedding '%s' applied, custom embeddings: %i" , embd_name.c_str (), num_custom_embeddings);
156
184
return true ;
157
185
}
158
186
@@ -402,7 +430,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
402
430
auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
403
431
struct ggml_tensor * input_ids2 = NULL ;
404
432
size_t max_token_idx = 0 ;
405
- if (version == VERSION_SDXL ) {
433
+ if (sd_version_is_sdxl ( version) ) {
406
434
auto it = std::find (chunk_tokens.begin (), chunk_tokens.end (), tokenizer.EOS_TOKEN_ID );
407
435
if (it != chunk_tokens.end ()) {
408
436
std::fill (std::next (it), chunk_tokens.end (), 0 );
@@ -427,7 +455,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
427
455
false ,
428
456
&chunk_hidden_states1,
429
457
work_ctx);
430
- if (version == VERSION_SDXL ) {
458
+ if (sd_version_is_sdxl ( version) ) {
431
459
text_model2->compute (n_threads,
432
460
input_ids2,
433
461
0 ,
@@ -486,7 +514,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
486
514
ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
487
515
488
516
ggml_tensor* vec = NULL ;
489
- if (version == VERSION_SDXL ) {
517
+ if (sd_version_is_sdxl ( version) ) {
490
518
int out_dim = 256 ;
491
519
vec = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, adm_in_channels);
492
520
// [0:1280]
0 commit comments