@@ -143,8 +143,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         params.no_alloc               = false;
         struct ggml_context* embd_ctx = ggml_init(params);
         struct ggml_tensor* embd      = NULL;
-        int64_t hidden_size           = text_model->model.hidden_size;
-        auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
+        int64_t hidden_size           = 0;
+        if (version != VERSION_SDXL_REFINER) {
+            hidden_size = text_model->model.hidden_size;
+        } else {
+            hidden_size = text_model2->model.hidden_size;
+        }
+        auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
             if (tensor_storage.ne[0] != hidden_size) {
                 LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
                 return false;
@@ -160,21 +165,24 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                    embd->data,
                    ggml_nbytes(embd));
             for (int i = 0; i < embd->ne[1]; i++) {
-                bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+                if (version != VERSION_SDXL_REFINER) {
+                    bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+                } else {
+                    bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings);
+                }
                 // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
                 num_custom_embeddings++;
             }
             LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
             return true;
         }
 
-    std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>>
-    tokenize_with_trigger_token(std::string text,
-                                int num_input_imgs,
-                                int32_t image_token,
-                                bool padding = false) {
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<bool>> tokenize_with_trigger_token(std::string text,
+                                                                                                    int num_input_imgs,
+                                                                                                    int32_t image_token,
+                                                                                                    bool padding = false) {
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
-                                           text_model->model.n_token, padding);
+                                           version != VERSION_SDXL_REFINER ? text_model->model.n_token : text_model2->model.n_token, padding);
     }
 
     std::vector<int> convert_token_to_id(std::string text) {
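
Note on the dispatch pattern above (illustration, not part of the patch): the repeated `version != VERSION_SDXL_REFINER ? text_model->... : text_model2->...` routing exists because the SDXL refiner ships only the second text encoder, so every query for `hidden_size`, `vocab_size`, or `n_token` must fall back to `text_model2`. The sketch below models that dispatch with hypothetical stub types; it does not use the real `CLIPTextModelRunner` API, only the three fields the hunks read.

```cpp
// Standalone sketch of the version dispatch applied in this patch.
// The stub types are illustrative stand-ins, not the project's classes.
#include <cstdint>
#include <cstdio>
#include <memory>

enum SDVersion { VERSION_SD1, VERSION_SDXL, VERSION_SDXL_REFINER };

struct TextModelStub {        // stand-in for the CLIP text model parameters
    int64_t hidden_size;
    int32_t vocab_size;
    int32_t n_token;
};

struct ConditionerStub {
    SDVersion version;
    std::shared_ptr<TextModelStub> text_model;   // CLIP ViT-L, absent on the refiner
    std::shared_ptr<TextModelStub> text_model2;  // OpenCLIP ViT-bigG, SDXL/refiner only

    // One helper instead of repeating the ternary at every call site.
    const TextModelStub& primary() const {
        return version != VERSION_SDXL_REFINER ? *text_model : *text_model2;
    }
};

int main() {
    ConditionerStub c{VERSION_SDXL_REFINER, nullptr,
                      std::make_shared<TextModelStub>(TextModelStub{1280, 49408, 77})};
    std::printf("hidden_size=%lld vocab=%d n_token=%d\n",
                (long long)c.primary().hidden_size,
                c.primary().vocab_size, c.primary().n_token);
    return 0;
}
```

The patch itself keeps the ternaries and if/else branches inline at each call site, which stays closer to the existing code; a helper like `primary()` above is just one way the same routing could be expressed.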
@@ -320,7 +328,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                              bool padding = false) {
-        return tokenize(text, text_model->model.n_token, padding);
+        if (version != VERSION_SDXL_REFINER) {
+            return tokenize(text, text_model->model.n_token, padding);
+        }
+        return tokenize(text, text_model2->model.n_token, padding);
     }
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
@@ -446,8 +457,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  max_token_idx,
                                  false,
                                  &chunk_hidden_states2, work_ctx);
-            // concat
-            chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+
+            if (version == VERSION_SDXL) {
+                // concat
+                chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+            } else {
+                chunk_hidden_states = chunk_hidden_states2;
+            }
 
             if (chunk_idx == 0) {
                 text_model2->compute(n_threads,
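
Background for the hunk above (drawn from the SDXL model family, not from this patch): base SDXL concatenates the CLIP ViT-L hidden states (768-dim) with the OpenCLIP ViT-bigG hidden states (1280-dim) along the feature axis to form 2048-dim conditioning, while the refiner uses only the 1280-dim ViT-bigG states, so there is nothing to concatenate. A minimal standalone sketch of that width arithmetic, with illustrative names:

```cpp
// Sketch only: expected feature width of the conditioning tensor per variant.
#include <cstdio>

enum SDVersion { VERSION_SDXL, VERSION_SDXL_REFINER };

constexpr int kClipVitL    = 768;   // OpenAI CLIP ViT-L/14 hidden size
constexpr int kClipVitBigG = 1280;  // OpenCLIP ViT-bigG/14 hidden size

int conditioning_width(SDVersion version) {
    // Base SDXL: concat of both encoders; refiner: only ViT-bigG is present.
    return version == VERSION_SDXL ? kClipVitL + kClipVitBigG : kClipVitBigG;
}

int main() {
    std::printf("SDXL base:    %d\n", conditioning_width(VERSION_SDXL));          // 2048
    std::printf("SDXL refiner: %d\n", conditioning_width(VERSION_SDXL_REFINER));  // 1280
    return 0;
}
```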
@@ -497,7 +513,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                    ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
 
             ggml_tensor* vec = NULL;
-            if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
+            if (version == VERSION_SDXL) {
                 int out_dim = 256;
                 vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
                 // [0:1280]
0 commit comments