
Commit 99d93eb

fix: flux
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent b5b57e9 commit 99d93eb

File tree

6 files changed: +160 -83 lines changed


CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -127,4 +127,3 @@ target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
 if (SD_BUILD_EXAMPLES)
     add_subdirectory(examples)
 endif()
-

conditioner.hpp

Lines changed: 9 additions & 2 deletions
@@ -1019,15 +1019,17 @@ struct SD3CLIPEmbedder : public Conditioner {
 
 struct FluxCLIPEmbedder : public Conditioner {
     ggml_type wtype;
+    bool compvis_compatiblity;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      ggml_type wtype,
-                     int clip_skip = -1)
-        : wtype(wtype) {
+                     bool compvis_compatiblity = false,
+                     int clip_skip = -1)
+        : wtype(wtype), compvis_compatiblity(compvis_compatiblity) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
@@ -1040,6 +1042,11 @@ struct FluxCLIPEmbedder : public Conditioner {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+        if (compvis_compatiblity) {
+            clip_l->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            t5->get_param_tensors(tensors, "cond_stage_model.1.transformer");
+            return;
+        }
         clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
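
The new compvis_compatiblity flag only changes which key prefixes the embedder registers its parameters under: legacy CompVis-style checkpoints keep everything beneath cond_stage_model.*, while Diffusers-style checkpoints use text_encoders.*. Below is a minimal standalone sketch of that prefix selection; the PrefixedEmbedder struct, the register_params() helper, and the tensor sub-keys are illustrative stand-ins, not the real CLIPTextModelRunner/T5Runner API.

```cpp
#include <iostream>
#include <map>
#include <string>

// Sketch only: pick tensor-name prefixes from a compatibility flag, mirroring
// the branch added to FluxCLIPEmbedder::get_param_tensors(). The sub-keys
// below are illustrative, not the model's actual tensor names.
struct PrefixedEmbedder {
    bool compvis_compatiblity = false;  // same spelling as in the commit

    void register_params(std::map<std::string, std::string>& tensors) const {
        const std::string clip_prefix = compvis_compatiblity
                                            ? "cond_stage_model.transformer.text_model"
                                            : "text_encoders.clip_l.transformer.text_model";
        const std::string t5_prefix = compvis_compatiblity
                                          ? "cond_stage_model.1.transformer"
                                          : "text_encoders.t5xxl.transformer";
        tensors[clip_prefix + ".embeddings.token_embedding.weight"] = "clip_l";
        tensors[t5_prefix + ".encoder.final_layer_norm.weight"]     = "t5";
    }
};

int main() {
    std::map<std::string, std::string> tensors;
    PrefixedEmbedder emb;
    emb.compvis_compatiblity = true;  // as passed in by stable-diffusion.cpp below
    emb.register_params(tensors);
    for (const auto& kv : tensors) {
        std::cout << kv.first << " -> " << kv.second << "\n";
    }
    return 0;
}
```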

examples/convert/main.cpp

Lines changed: 86 additions & 72 deletions
@@ -239,30 +239,6 @@ int convert_sd3(const convert_params& params, const SDVersion ver) {
     ModelLoader loader;
     bool loaded = false;
 
-    bool ignore_vae = false;
-    if (params.diffusion_model_file_path.empty()) {
-        loaded = loader.init_from_safetensors_file(params.model_path, "transformer/diffusion_pytorch_model", params.output_type, "transformer.");
-    } else {
-        ignore_vae = true;
-        loaded = loader.init_from_file(params.diffusion_model_file_path);
-    }
-    if (!loaded) {
-        LOG_ERROR("Failed to load transformer model");
-        return 1;
-    }
-
-    if (!ignore_vae || !params.vae_model_file_path.empty()) {
-        if (params.vae_model_file_path.empty()) {
-            loaded = loader.init_from_safetensors_file(params.model_path, "vae/diffusion_pytorch_model", params.vae_output_type, "vae.");
-        } else {
-            loaded = loader.init_from_file(params.vae_model_file_path, "vae.");
-        }
-        if (!loaded) {
-            LOG_ERROR("Failed to load vae model");
-            return 1;
-        }
-    }
-
     if (params.clip_l_model_file_path.empty()) {
         loaded = loader.init_from_safetensors_file(params.model_path, "text_encoder/model", params.clip_output_type, "te.");
     } else {
@@ -293,27 +269,9 @@ int convert_sd3(const convert_params& params, const SDVersion ver) {
         return 1;
     }
 
-    return !loader.save_to_gguf_file(params.output_file_path, params.output_type, params.vae_output_type, params.clip_output_type);
-}
-
-int convert_flux(const convert_params& params, const SDVersion ver) {
-    ModelLoader loader;
-    bool loaded = false;
-
     bool ignore_vae = false;
-    if (params.diffusion_model_file_path.empty()) {
-        if (ver == VERSION_FLUX_DEV) {
-            loaded = loader.init_from_safetensors_file(params.model_path, "flux1-dev", params.output_type, "transformer.");
-        } else {
-            loaded = loader.init_from_safetensors_file(params.model_path, "flux1-schnell", params.output_type, "transformer.");
-        }
-    } else {
+    if (!params.diffusion_model_file_path.empty()) {
         ignore_vae = true;
-        loaded = loader.init_from_file(params.diffusion_model_file_path);
-    }
-    if (!loaded) {
-        LOG_ERROR("Failed to load transformer model");
-        return 1;
     }
 
     if (!ignore_vae || !params.vae_model_file_path.empty()) {
@@ -328,6 +286,23 @@ int convert_flux(const convert_params& params, const SDVersion ver) {
         }
     }
 
+    if (params.diffusion_model_file_path.empty()) {
+        loaded = loader.init_from_safetensors_file(params.model_path, "transformer/diffusion_pytorch_model", params.output_type, "transformer.");
+    } else {
+        loaded = loader.init_from_file(params.diffusion_model_file_path);
+    }
+    if (!loaded) {
+        LOG_ERROR("Failed to load transformer model");
+        return 1;
+    }
+
+    return !loader.save_to_gguf_file(params.output_file_path, params.output_type, params.vae_output_type, params.clip_output_type);
+}
+
+int convert_flux(const convert_params& params, const SDVersion ver) {
+    ModelLoader loader;
+    bool loaded = false;
+
     if (params.clip_l_model_file_path.empty()) {
         loaded = loader.init_from_safetensors_file(params.model_path, "text_encoder/model", params.clip_output_type, "te.");
     } else {
@@ -348,23 +323,9 @@ int convert_flux(const convert_params& params, const SDVersion ver) {
         return 1;
     }
 
-    return !loader.save_to_gguf_file(params.output_file_path, params.output_type, params.vae_output_type, params.clip_output_type);
-}
-
-int convert_sdxl(const convert_params& params, const SDVersion ver) {
-    ModelLoader loader;
-    bool loaded = false;
-
     bool ignore_vae = false;
-    if (params.diffusion_model_file_path.empty()) {
-        loaded = loader.init_from_safetensors_file(params.model_path, "unet/diffusion_pytorch_model", params.output_type, "unet.");
-    } else {
+    if (!params.diffusion_model_file_path.empty()) {
         ignore_vae = true;
-        loaded = loader.init_from_file(params.diffusion_model_file_path);
-    }
-    if (!loaded) {
-        LOG_ERROR("Failed to load unet model");
-        return 1;
     }
 
     if (!ignore_vae || !params.vae_model_file_path.empty()) {
@@ -379,6 +340,27 @@ int convert_sdxl(const convert_params& params, const SDVersion ver) {
         }
     }
 
+    if (params.diffusion_model_file_path.empty()) {
+        if (ver == VERSION_FLUX_DEV) {
+            loaded = loader.init_from_safetensors_file(params.model_path, "flux1-dev", params.output_type, "transformer.");
+        } else {
+            loaded = loader.init_from_safetensors_file(params.model_path, "flux1-schnell", params.output_type, "transformer.");
+        }
+    } else {
+        loaded = loader.init_from_file(params.diffusion_model_file_path, "model.diffusion_model.");
+    }
+    if (!loaded) {
+        LOG_ERROR("Failed to load transformer model");
+        return 1;
+    }
+
+    return !loader.save_to_gguf_file(params.output_file_path, params.output_type, params.vae_output_type, params.clip_output_type);
+}
+
+int convert_sdxl(const convert_params& params, const SDVersion ver) {
+    ModelLoader loader;
+    bool loaded = false;
+
     if (params.clip_l_model_file_path.empty()) {
         if (is_directory(path_join(params.model_path, "text_encoder"))) {
             loaded = loader.init_from_safetensors_file(params.model_path, "text_encoder/model", params.clip_output_type, "te.");
@@ -401,25 +383,55 @@ int convert_sdxl(const convert_params& params, const SDVersion ver) {
         return 1;
     }
 
+    bool ignore_vae = false;
+    if (!params.diffusion_model_file_path.empty()) {
+        ignore_vae = true;
+    }
+
+    if (!ignore_vae || !params.vae_model_file_path.empty()) {
+        if (params.vae_model_file_path.empty()) {
+            loaded = loader.init_from_safetensors_file(params.model_path, "vae/diffusion_pytorch_model", params.vae_output_type, "vae.");
+        } else {
+            loaded = loader.init_from_file(params.vae_model_file_path, "vae.");
+        }
+        if (!loaded) {
+            LOG_ERROR("Failed to load vae model");
+            return 1;
+        }
+    }
+
+    if (params.diffusion_model_file_path.empty()) {
+        loaded = loader.init_from_safetensors_file(params.model_path, "unet/diffusion_pytorch_model", params.output_type, "unet.");
+    } else {
+        loaded = loader.init_from_file(params.diffusion_model_file_path);
+    }
+    if (!loaded) {
+        LOG_ERROR("Failed to load unet model");
+        return 1;
+    }
+
     return !loader.save_to_gguf_file(params.output_file_path, params.output_type, params.vae_output_type, params.clip_output_type);
 }
 
 int convert_sd(const convert_params& params, const SDVersion ver) {
     ModelLoader loader;
     bool loaded = false;
 
-    bool ignore_vae = false;
-    if (params.diffusion_model_file_path.empty()) {
-        loaded = loader.init_from_safetensors_file(params.model_path, "unet/diffusion_pytorch_model", params.output_type, "unet.");
+    if (params.clip_l_model_file_path.empty()) {
+        loaded = loader.init_from_safetensors_file(params.model_path, "text_encoder/model", params.clip_output_type, "te.");
     } else {
-        ignore_vae = true;
-        loaded = loader.init_from_file(params.diffusion_model_file_path);
+        loaded = loader.init_from_file(params.clip_l_model_file_path, "te.");
    }
     if (!loaded) {
-        LOG_ERROR("Failed to load unet model");
+        LOG_ERROR("Failed to load text encoder model");
         return 1;
     }
 
+    bool ignore_vae = false;
+    if (!params.diffusion_model_file_path.empty()) {
+        ignore_vae = true;
+    }
+
     if (!ignore_vae || !params.vae_model_file_path.empty()) {
         if (params.vae_model_file_path.empty()) {
             loaded = loader.init_from_safetensors_file(params.model_path, "vae/diffusion_pytorch_model", params.vae_output_type, "vae.");
@@ -432,13 +444,13 @@ int convert_sd(const convert_params& params, const SDVersion ver) {
         }
     }
 
-    if (params.clip_l_model_file_path.empty()) {
-        loaded = loader.init_from_safetensors_file(params.model_path, "text_encoder/model", params.clip_output_type, "te.");
+    if (params.diffusion_model_file_path.empty()) {
+        loaded = loader.init_from_safetensors_file(params.model_path, "unet/diffusion_pytorch_model", params.output_type, "unet.");
     } else {
-        loaded = loader.init_from_file(params.clip_l_model_file_path, "te.");
+        loaded = loader.init_from_file(params.diffusion_model_file_path);
     }
     if (!loaded) {
-        LOG_ERROR("Failed to load text encoder model");
+        LOG_ERROR("Failed to load unet model");
         return 1;
     }
 
@@ -505,11 +517,13 @@ int main(int argc, char** argv) {
                 return 1;
             }
             auto text_encoder_config = load_json(text_encoder_config_path);
-            auto guidance_embeds = text_encoder_config.at("guidance_embeds").get<bool>();
-            if (guidance_embeds) {
-                ver = VERSION_FLUX_DEV;
+            ver = VERSION_FLUX_SCHNELL;
+            if (text_encoder_config.contains("guidance_embeds")) {
+                auto guidance_embeds = text_encoder_config.at("guidance_embeds").get<bool>();
+                if (guidance_embeds) {
+                    ver = VERSION_FLUX_DEV;
+                }
             } else {
-                ver = VERSION_FLUX_SCHNELL;
             }
         } else if (class_name == "StableDiffusionXLPipeline" || class_name == "StableDiffusionXLImg2ImgPipeline") {
             ver = VERSION_SDXL;
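
Besides reordering the converters so the text encoders and VAE are loaded before the transformer/unet, the change above hardens FLUX version detection: the version now defaults to schnell and is only bumped to dev when the config actually contains a guidance_embeds key, instead of unconditionally calling .at() and throwing on configs without it. A small self-contained sketch of that logic follows, assuming the config is an nlohmann::json object (which the .contains()/.at().get<bool>() calls in the diff suggest, but which is an assumption here).

```cpp
#include <iostream>
#include <nlohmann/json.hpp>  // assumption: load_json() above yields an nlohmann::json object

enum SDVersion { VERSION_FLUX_DEV, VERSION_FLUX_SCHNELL };

// Default to schnell; only upgrade to dev when "guidance_embeds" is present
// and true. A config without the key no longer throws.
static SDVersion detect_flux_version(const nlohmann::json& config) {
    SDVersion ver = VERSION_FLUX_SCHNELL;
    if (config.contains("guidance_embeds") && config.at("guidance_embeds").get<bool>()) {
        ver = VERSION_FLUX_DEV;
    }
    return ver;
}

int main() {
    auto dev_cfg     = nlohmann::json::parse(R"({"guidance_embeds": true})");
    auto schnell_cfg = nlohmann::json::parse(R"({})");
    std::cout << (detect_flux_version(dev_cfg) == VERSION_FLUX_DEV) << "\n";          // 1
    std::cout << (detect_flux_version(schnell_cfg) == VERSION_FLUX_SCHNELL) << "\n";  // 1
    return 0;
}
```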

model.cpp

Lines changed: 19 additions & 7 deletions
@@ -88,6 +88,7 @@ const char* unused_tensors[] = {
     "text_model.embeddings.position_ids",
     "cond_stage_model.transformer.text_model.embeddings.position_ids",
     "cond_stage_model.transformer.text_model.text_projection",
+    "cond_stage_model.1.transformer.encoder.embed_tokens",
     "cond_stage_model.2.transformer.encoder.embed_tokens",
     "cond_stage_model.model.logit_scale",
     "cond_stage_model.model.text_projection",
@@ -327,6 +328,13 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
         key += format("%c0", seq);
     }
 
+    // return directly
+    if (starts_with(key, format("cond_stage_model%c", seq)) ||
+        starts_with(key, format("first_stage_model%c", seq)) ||
+        starts_with(key, format("model%cdiffusion_model%c", seq, seq))) {
+        return key;
+    }
+
     // unet
     if (match(m, std::regex(format("unet%cconv_in(.*)", seq)), key)) {
         return format("model%cdiffusion_model%cinput_blocks%c0%c0", seq, seq, seq, seq) + m[0];
@@ -472,26 +480,28 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
         return format("model%cdiffusion_model%ct_embedder%cmlp%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
     }
 
-    if (match(m, std::regex(format("transformer_blocks%c(\\d+)%cnorm(\\d+)_context%clinear", seq, seq, seq)), key)) {
+    if (match(m, std::regex(format("transformer%c(\\d+)%cnorm(\\d+)_context%clinear", seq, seq, seq)), key)) {
         return format("model%cdiffusion_model%cjoint_blocks%c%s%ccontext_block%cadaLN_modulation%c%s", seq, seq, seq, m[0].c_str(), seq, seq, seq, m[1].c_str());
     }
 
-    if (match(m, std::regex(format("transformer_blocks%c(\\d+)%cff_context%cnet%c(\\d+)%c", seq, seq, seq, seq)), key)) {
+    if (match(m, std::regex(format("transformer%ctransformer_blocks%c(\\d+)%cff_context%cnet%c(\\d+)%c", seq, seq, seq, seq, seq)), key)) {
         return format("model%cdiffusion_model%cjoint_blocks%c%s%ccontext_block%cmlp%cfc%s", seq, seq, seq, m[0].c_str(), seq, seq, seq, std::to_string(std::stoi(m[1]) / 2 + 1).c_str());
     }
 
-    if (match(m, std::regex(format("transformer_blocks%c(\\d+)%cnorm(\\d+)%clinear", seq, seq, seq)), key)) {
+    if (match(m, std::regex(format("transformer%ctransformer_blocks%c(\\d+)%cnorm(\\d+)%clinear", seq, seq, seq, seq)), key)) {
         return format("model%cdiffusion_model%cjoint_blocks%c%s%cx_block%cadaLN_modulation%c%s", seq, seq, seq, m[0].c_str(), seq, seq, seq, m[1].c_str());
     }
 
-    if (match(m, std::regex(format("transformer_blocks%c(\\d+)%cff%cnet%c(\\d+)%c", seq, seq, seq, seq)), key)) {
+    if (match(m, std::regex(format("transformer%ctransformer_blocks%c(\\d+)%cff%cnet%c(\\d+)%c", seq, seq, seq, seq, seq)), key)) {
         return format("model%cdiffusion_model%cjoint_blocks%c%s%cx_block%cmlp%cfc%s", seq, seq, seq, m[0].c_str(), seq, seq, seq, std::to_string(std::stoi(m[1]) / 2 + 1).c_str());
     }
 
     if (match(m, std::regex(format("transformer%ctransformer_blocks%c(.*)", seq, seq)), key)) {
         return format("model%cdiffusion_model%cjoint_blocks%c", seq, seq, seq) + m[0];
     }
 
+    // TODO: add more transformer conversion
+
     if (match(m, std::regex(format("transformer%c(.*)", seq)), key)) {
         if (m[0] == format("norm_out%clinear", seq)) {
             m[0] = format("final_layer%cadaLN_modulation%c1", seq, seq);
@@ -2004,11 +2014,13 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type outt
     };
 
     bool success = load_tensors(on_new_tensor_cb, backend);
-    ggml_backend_free(backend);
-    LOG_INFO("load tensors done");
-    LOG_INFO("trying to save tensors to %s", file_path.c_str());
     if (success) {
+        LOG_INFO("load tensors done");
+        LOG_INFO("trying to save tensors to %s", file_path.c_str());
         gguf_write_to_file(gguf_ctx, file_path.c_str(), false);
+    } else {
+        LOG_ERROR("load tensors failed");
+        ggml_backend_free(backend);
     }
     ggml_free(ggml_ctx);
     gguf_free(gguf_ctx);
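
The early return added to convert_diffusers_name_to_compvis means keys that already use the CompVis layout (cond_stage_model.*, first_stage_model.*, model.diffusion_model.*) skip the Diffusers-to-CompVis regex rules entirely. A minimal sketch of that guard follows, with starts_with() written out locally as a stand-in for the project's helper and a fixed '.' separator instead of the seq parameter.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the helper used in model.cpp: prefix check on a tensor name.
static bool starts_with(const std::string& s, const std::string& prefix) {
    return s.compare(0, prefix.size(), prefix) == 0;
}

// Keys that are already CompVis-style are returned untouched; everything else
// would fall through to the Diffusers-to-CompVis regex rules.
std::string convert_name(const std::string& key) {
    static const std::vector<std::string> compvis_prefixes = {
        "cond_stage_model.", "first_stage_model.", "model.diffusion_model."};
    for (const auto& p : compvis_prefixes) {
        if (starts_with(key, p)) {
            return key;  // return directly
        }
    }
    // ... regex-based Diffusers -> CompVis conversion would run here ...
    return key;
}

int main() {
    std::cout << convert_name("model.diffusion_model.input_blocks.0.0.weight") << "\n";
    return 0;
}
```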

patches/ggml/write.patch

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
diff --git a/src/ggml.c b/src/ggml.c
index bc03401..08d1678 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -8127,13 +8127,37 @@ void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
         GGML_ABORT("failed to open file for writing");
     }
 
+    // write meta data
     struct gguf_buf buf = gguf_buf_init(16*1024);
+    gguf_write_to_buf(ctx, &buf, true);
+    fwrite(buf.data, 1, buf.offset, file);
+    gguf_buf_free(buf);
 
-    gguf_write_to_buf(ctx, &buf, only_meta);
+    if (only_meta) {
+        fclose(file);
+        return;
+    }
 
-    fwrite(buf.data, 1, buf.offset, file);
+    // write tensor data
+    size_t offset = 0;
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
 
-    gguf_buf_free(buf);
+        const size_t size = info->size;
+        const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+        fwrite(info->data, 1, size, file);
+
+        if (size_pad != size) {
+            uint8_t pad = 0;
+            for (size_t j = 0; j < size_pad - size; ++j) {
+                fwrite(&pad, 1, sizeof(pad), file);
+            }
+        }
+
+        GGML_ASSERT(offset == info->offset);
+        offset += size_pad;
+    }
 
     fclose(file);
 }
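
The patched gguf_write_to_file() writes the metadata buffer first and then streams each tensor, padding every tensor's byte size up to the next multiple of the context alignment so the running file offset matches info->offset. A small sketch of that round-up arithmetic follows; the 32-byte alignment used here is just the usual GGUF default and the tensor sizes are made up, not read from a real context.

```cpp
#include <cstddef>
#include <cstdio>

// GGML_PAD-style round-up: pad a byte size to the next multiple of alignment.
static size_t pad_to(size_t size, size_t alignment) {
    return (size + alignment - 1) / alignment * alignment;
}

int main() {
    const size_t alignment = 32;            // assumed GGUF default alignment
    const size_t sizes[]   = {100, 64, 7};  // illustrative tensor byte sizes

    size_t offset = 0;
    for (size_t size : sizes) {
        const size_t size_pad = pad_to(size, alignment);
        std::printf("tensor of %zu bytes -> %zu padding bytes, next offset %zu\n",
                    size, size_pad - size, offset + size_pad);
        offset += size_pad;  // mirrors `offset += size_pad;` in the patch
    }
    return 0;
}
```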

stable-diffusion.cpp

Lines changed: 1 addition & 1 deletion
@@ -361,7 +361,7 @@ class StableDiffusionGGML {
             cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype, model_loader.has_prefix_tensors("cond_stage_model."));
             diffusion_model = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
         } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
-            cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, conditioner_wtype);
+            cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, conditioner_wtype, model_loader.has_prefix_tensors("cond_stage_model."));
             diffusion_model = std::make_shared<FluxModel>(backend, diffusion_model_wtype, version);
         } else {
             cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version);
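
FluxCLIPEmbedder now receives model_loader.has_prefix_tensors("cond_stage_model."), the same probe SD3CLIPEmbedder already used, so checkpoints stored with CompVis-style names get the matching tensor prefixes. Below is a hypothetical sketch of what such a prefix probe amounts to; it is not the actual ModelLoader implementation, which may differ.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for ModelLoader::has_prefix_tensors(): scan the loaded
// tensor names and report whether any of them starts with the given prefix.
static bool has_prefix_tensors(const std::vector<std::string>& tensor_names,
                               const std::string& prefix) {
    for (const auto& name : tensor_names) {
        if (name.compare(0, prefix.size(), prefix) == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    const std::vector<std::string> names = {
        "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight",
        "model.diffusion_model.img_in.weight",
    };
    // true -> construct FluxCLIPEmbedder with compvis_compatiblity enabled
    std::cout << std::boolalpha << has_prefix_tensors(names, "cond_stage_model.") << "\n";
    return 0;
}
```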
