refactor: remove img2vid

thxCode · thxCode · commit 02d35becb776 · 2024-11-08T13:04:30.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -54,14 +54,12 @@ const char* schedule_str[] = {
 const char* modes_str[] = {
     "txt2img",
     "img2img",
-    "img2vid",
     "convert",
 };
 
 enum SDMode {
     TXT2IMG,
     IMG2IMG,
-    IMG2VID,
     CONVERT,
     MODE_COUNT
 };
@@ -98,11 +96,6 @@ struct SDParams {
     int height        = 512;
     int batch_count   = 1;
 
-    int video_frames         = 6;
-    int motion_bucket_id     = 127;
-    int fps                  = 6;
-    float augmentation_level = 0.f;
-
     sample_method_t sample_method = EULER_A;
     schedule_t schedule           = DEFAULT;
     int sample_steps              = 20;
@@ -247,7 +240,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             }
             if (mode_found == -1) {
                 fprintf(stderr,
-                        "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
+                        "error: invalid mode %s, must be one of [txt2img, img2img, convert]\n",
                         mode_selected);
                 exit(1);
             }
@@ -549,7 +542,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         params.n_threads = get_num_physical_cores();
     }
 
-    if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
+    if (params.mode != CONVERT && params.prompt.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: prompt\n");
         print_usage(argc, argv);
         exit(1);
@@ -561,7 +554,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
-    if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
+    if (params.mode == IMG2IMG && params.input_path.length() == 0) {
         fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
         print_usage(argc, argv);
         exit(1);
@@ -711,15 +704,10 @@ int main(int argc, const char* argv[]) {
         }
     }
 
-    if (params.mode == IMG2VID) {
-        fprintf(stderr, "SVD support is broken, do not use it!!!\n");
-        return 1;
-    }
-
     bool vae_decode_only          = true;
     uint8_t* input_image_buffer   = NULL;
     uint8_t* control_image_buffer = NULL;
-    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
+    if (params.mode == IMG2IMG) {
         vae_decode_only = false;
 
         int c              = 0;
@@ -847,63 +835,25 @@ int main(int argc, const char* argv[]) {
                                   3,
                                   input_image_buffer};
 
-        if (params.mode == IMG2VID) {
-            results = img2vid(sd_ctx,
-                              input_image,
-                              params.width,
-                              params.height,
-                              params.video_frames,
-                              params.motion_bucket_id,
-                              params.fps,
-                              params.augmentation_level,
-                              params.min_cfg,
-                              params.cfg_scale,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed);
-            if (results == NULL) {
-                printf("generate failed\n");
-                free_sd_ctx(sd_ctx);
-                return 1;
-            }
-            size_t last            = params.output_path.find_last_of(".");
-            std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
-            for (int i = 0; i < params.video_frames; i++) {
-                if (results[i].data == NULL) {
-                    continue;
-                }
-                std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
-                stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
-                               results[i].data, 0, get_image_params(params, params.seed + i).c_str());
-                printf("save result image to '%s'\n", final_image_path.c_str());
-                free(results[i].data);
-                results[i].data = NULL;
-            }
-            free(results);
-            free_sd_ctx(sd_ctx);
-            return 0;
-        } else {
-            results = img2img(sd_ctx,
-                              input_image,
-                              params.prompt.c_str(),
-                              params.negative_prompt.c_str(),
-                              params.clip_skip,
-                              params.cfg_scale,
-                              params.guidance,
-                              params.width,
-                              params.height,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed,
-                              params.batch_count,
-                              control_image,
-                              params.control_strength,
-                              params.style_ratio,
-                              params.normalize_input,
-                              params.input_id_images_path.c_str());
-        }
+        results = img2img(sd_ctx,
+                          input_image,
+                          params.prompt.c_str(),
+                          params.negative_prompt.c_str(),
+                          params.clip_skip,
+                          params.cfg_scale,
+                          params.guidance,
+                          params.width,
+                          params.height,
+                          params.sample_method,
+                          params.sample_steps,
+                          params.strength,
+                          params.seed,
+                          params.batch_count,
+                          control_image,
+                          params.control_strength,
+                          params.style_ratio,
+                          params.normalize_input,
+                          params.input_id_images_path.c_str());
     }
 
     if (results == NULL) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1557,135 +1557,3 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
     return result_images;
 }
-
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed) {
-    if (sd_ctx == NULL) {
-        return NULL;
-    }
-
-    LOG_INFO("img2vid %dx%d", width, height);
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-
-    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
-    params.mem_size += width * height * 3 * sizeof(float) * video_frames;
-    params.mem_buffer = NULL;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    // draft context
-    struct ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return NULL;
-    }
-
-    if (seed < 0) {
-        seed = (int)time(NULL);
-    }
-
-    sd_ctx->sd->rng->manual_seed(seed);
-
-    int64_t t0 = ggml_time_ms();
-
-    SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx,
-                                                     init_image,
-                                                     width,
-                                                     height,
-                                                     fps,
-                                                     motion_bucket_id,
-                                                     augmentation_level);
-
-    auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn);
-    ggml_set_f32(uc_crossattn, 0.f);
-
-    auto uc_concat = ggml_dup_tensor(work_ctx, cond.c_concat);
-    ggml_set_f32(uc_concat, 0.f);
-
-    auto uc_vector = ggml_dup_tensor(work_ctx, cond.c_vector);
-
-    SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat);
-
-    int64_t t1 = ggml_time_ms();
-    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->clip_vision->free_params_buffer();
-    }
-
-    sd_ctx->sd->rng->manual_seed(seed);
-    int C                   = 4;
-    int W                   = width / 8;
-    int H                   = height / 8;
-    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
-    ggml_set_f32(x_t, 0.f);
-
-    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
-    ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
-
-    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
-                                                 x_t,
-                                                 noise,
-                                                 cond,
-                                                 uncond,
-                                                 {},
-                                                 0.f,
-                                                 min_cfg,
-                                                 cfg_scale,
-                                                 0.f,
-                                                 sample_method,
-                                                 sigmas,
-                                                 -1,
-                                                 SDCondition(NULL, NULL, NULL));
-
-    int64_t t2 = ggml_time_ms();
-    LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->diffusion_model->free_params_buffer();
-    }
-
-    struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0);
-    if (sd_ctx->sd->free_params_immediately) {
-        sd_ctx->sd->first_stage_model->free_params_buffer();
-    }
-    if (img == NULL) {
-        ggml_free(work_ctx);
-        return NULL;
-    }
-
-    sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t));
-    if (result_images == NULL) {
-        ggml_free(work_ctx);
-        return NULL;
-    }
-
-    for (size_t i = 0; i < video_frames; i++) {
-        auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i);
-
-        result_images[i].width   = width;
-        result_images[i].height  = height;
-        result_images[i].channel = 3;
-        result_images[i].data    = sd_tensor_to_image(img_i);
-    }
-    ggml_free(work_ctx);
-
-    int64_t t3 = ggml_time_ms();
-
-    LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
-
-    return result_images;
-}
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -184,21 +184,6 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            bool normalize_input,
                            const char* input_id_images_path);
 
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed);
-
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,