Skip to content

Commit 02d35be

Browse files
committed
refactor: remove img2vid
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 353e2e1 commit 02d35be

File tree

3 files changed

+23
-220
lines changed

3 files changed

+23
-220
lines changed

examples/cli/main.cpp

Lines changed: 23 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,12 @@ const char* schedule_str[] = {
5454
const char* modes_str[] = {
5555
"txt2img",
5656
"img2img",
57-
"img2vid",
5857
"convert",
5958
};
6059

6160
enum SDMode {
6261
TXT2IMG,
6362
IMG2IMG,
64-
IMG2VID,
6563
CONVERT,
6664
MODE_COUNT
6765
};
@@ -98,11 +96,6 @@ struct SDParams {
9896
int height = 512;
9997
int batch_count = 1;
10098

101-
int video_frames = 6;
102-
int motion_bucket_id = 127;
103-
int fps = 6;
104-
float augmentation_level = 0.f;
105-
10699
sample_method_t sample_method = EULER_A;
107100
schedule_t schedule = DEFAULT;
108101
int sample_steps = 20;
@@ -247,7 +240,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
247240
}
248241
if (mode_found == -1) {
249242
fprintf(stderr,
250-
"error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
243+
"error: invalid mode %s, must be one of [txt2img, img2img, convert]\n",
251244
mode_selected);
252245
exit(1);
253246
}
@@ -549,7 +542,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
549542
params.n_threads = get_num_physical_cores();
550543
}
551544

552-
if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
545+
if (params.mode != CONVERT && params.prompt.length() == 0) {
553546
fprintf(stderr, "error: the following arguments are required: prompt\n");
554547
print_usage(argc, argv);
555548
exit(1);
@@ -561,7 +554,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
561554
exit(1);
562555
}
563556

564-
if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
557+
if (params.mode == IMG2IMG && params.input_path.length() == 0) {
565558
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
566559
print_usage(argc, argv);
567560
exit(1);
@@ -711,15 +704,10 @@ int main(int argc, const char* argv[]) {
711704
}
712705
}
713706

714-
if (params.mode == IMG2VID) {
715-
fprintf(stderr, "SVD support is broken, do not use it!!!\n");
716-
return 1;
717-
}
718-
719707
bool vae_decode_only = true;
720708
uint8_t* input_image_buffer = NULL;
721709
uint8_t* control_image_buffer = NULL;
722-
if (params.mode == IMG2IMG || params.mode == IMG2VID) {
710+
if (params.mode == IMG2IMG) {
723711
vae_decode_only = false;
724712

725713
int c = 0;
@@ -847,63 +835,25 @@ int main(int argc, const char* argv[]) {
847835
3,
848836
input_image_buffer};
849837

850-
if (params.mode == IMG2VID) {
851-
results = img2vid(sd_ctx,
852-
input_image,
853-
params.width,
854-
params.height,
855-
params.video_frames,
856-
params.motion_bucket_id,
857-
params.fps,
858-
params.augmentation_level,
859-
params.min_cfg,
860-
params.cfg_scale,
861-
params.sample_method,
862-
params.sample_steps,
863-
params.strength,
864-
params.seed);
865-
if (results == NULL) {
866-
printf("generate failed\n");
867-
free_sd_ctx(sd_ctx);
868-
return 1;
869-
}
870-
size_t last = params.output_path.find_last_of(".");
871-
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
872-
for (int i = 0; i < params.video_frames; i++) {
873-
if (results[i].data == NULL) {
874-
continue;
875-
}
876-
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
877-
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
878-
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
879-
printf("save result image to '%s'\n", final_image_path.c_str());
880-
free(results[i].data);
881-
results[i].data = NULL;
882-
}
883-
free(results);
884-
free_sd_ctx(sd_ctx);
885-
return 0;
886-
} else {
887-
results = img2img(sd_ctx,
888-
input_image,
889-
params.prompt.c_str(),
890-
params.negative_prompt.c_str(),
891-
params.clip_skip,
892-
params.cfg_scale,
893-
params.guidance,
894-
params.width,
895-
params.height,
896-
params.sample_method,
897-
params.sample_steps,
898-
params.strength,
899-
params.seed,
900-
params.batch_count,
901-
control_image,
902-
params.control_strength,
903-
params.style_ratio,
904-
params.normalize_input,
905-
params.input_id_images_path.c_str());
906-
}
838+
results = img2img(sd_ctx,
839+
input_image,
840+
params.prompt.c_str(),
841+
params.negative_prompt.c_str(),
842+
params.clip_skip,
843+
params.cfg_scale,
844+
params.guidance,
845+
params.width,
846+
params.height,
847+
params.sample_method,
848+
params.sample_steps,
849+
params.strength,
850+
params.seed,
851+
params.batch_count,
852+
control_image,
853+
params.control_strength,
854+
params.style_ratio,
855+
params.normalize_input,
856+
params.input_id_images_path.c_str());
907857
}
908858

909859
if (results == NULL) {

stable-diffusion.cpp

Lines changed: 0 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -1557,135 +1557,3 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
15571557

15581558
return result_images;
15591559
}
1560-
1561-
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
1562-
sd_image_t init_image,
1563-
int width,
1564-
int height,
1565-
int video_frames,
1566-
int motion_bucket_id,
1567-
int fps,
1568-
float augmentation_level,
1569-
float min_cfg,
1570-
float cfg_scale,
1571-
enum sample_method_t sample_method,
1572-
int sample_steps,
1573-
float strength,
1574-
int64_t seed) {
1575-
if (sd_ctx == NULL) {
1576-
return NULL;
1577-
}
1578-
1579-
LOG_INFO("img2vid %dx%d", width, height);
1580-
1581-
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
1582-
1583-
struct ggml_init_params params;
1584-
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10 MB
1585-
params.mem_size += width * height * 3 * sizeof(float) * video_frames;
1586-
params.mem_buffer = NULL;
1587-
params.no_alloc = false;
1588-
// LOG_DEBUG("mem_size %u ", params.mem_size);
1589-
1590-
// draft context
1591-
struct ggml_context* work_ctx = ggml_init(params);
1592-
if (!work_ctx) {
1593-
LOG_ERROR("ggml_init() failed");
1594-
return NULL;
1595-
}
1596-
1597-
if (seed < 0) {
1598-
seed = (int)time(NULL);
1599-
}
1600-
1601-
sd_ctx->sd->rng->manual_seed(seed);
1602-
1603-
int64_t t0 = ggml_time_ms();
1604-
1605-
SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx,
1606-
init_image,
1607-
width,
1608-
height,
1609-
fps,
1610-
motion_bucket_id,
1611-
augmentation_level);
1612-
1613-
auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn);
1614-
ggml_set_f32(uc_crossattn, 0.f);
1615-
1616-
auto uc_concat = ggml_dup_tensor(work_ctx, cond.c_concat);
1617-
ggml_set_f32(uc_concat, 0.f);
1618-
1619-
auto uc_vector = ggml_dup_tensor(work_ctx, cond.c_vector);
1620-
1621-
SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat);
1622-
1623-
int64_t t1 = ggml_time_ms();
1624-
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
1625-
if (sd_ctx->sd->free_params_immediately) {
1626-
sd_ctx->sd->clip_vision->free_params_buffer();
1627-
}
1628-
1629-
sd_ctx->sd->rng->manual_seed(seed);
1630-
int C = 4;
1631-
int W = width / 8;
1632-
int H = height / 8;
1633-
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
1634-
ggml_set_f32(x_t, 0.f);
1635-
1636-
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
1637-
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
1638-
1639-
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
1640-
struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
1641-
x_t,
1642-
noise,
1643-
cond,
1644-
uncond,
1645-
{},
1646-
0.f,
1647-
min_cfg,
1648-
cfg_scale,
1649-
0.f,
1650-
sample_method,
1651-
sigmas,
1652-
-1,
1653-
SDCondition(NULL, NULL, NULL));
1654-
1655-
int64_t t2 = ggml_time_ms();
1656-
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
1657-
if (sd_ctx->sd->free_params_immediately) {
1658-
sd_ctx->sd->diffusion_model->free_params_buffer();
1659-
}
1660-
1661-
struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0);
1662-
if (sd_ctx->sd->free_params_immediately) {
1663-
sd_ctx->sd->first_stage_model->free_params_buffer();
1664-
}
1665-
if (img == NULL) {
1666-
ggml_free(work_ctx);
1667-
return NULL;
1668-
}
1669-
1670-
sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t));
1671-
if (result_images == NULL) {
1672-
ggml_free(work_ctx);
1673-
return NULL;
1674-
}
1675-
1676-
for (size_t i = 0; i < video_frames; i++) {
1677-
auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i);
1678-
1679-
result_images[i].width = width;
1680-
result_images[i].height = height;
1681-
result_images[i].channel = 3;
1682-
result_images[i].data = sd_tensor_to_image(img_i);
1683-
}
1684-
ggml_free(work_ctx);
1685-
1686-
int64_t t3 = ggml_time_ms();
1687-
1688-
LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
1689-
1690-
return result_images;
1691-
}

stable-diffusion.h

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -184,21 +184,6 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
184184
bool normalize_input,
185185
const char* input_id_images_path);
186186

187-
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
188-
sd_image_t init_image,
189-
int width,
190-
int height,
191-
int video_frames,
192-
int motion_bucket_id,
193-
int fps,
194-
float augmentation_level,
195-
float min_cfg,
196-
float cfg_scale,
197-
enum sample_method_t sample_method,
198-
int sample_steps,
199-
float strength,
200-
int64_t seed);
201-
202187
typedef struct upscaler_ctx_t upscaler_ctx_t;
203188

204189
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,

0 commit comments

Comments
 (0)