[pull] master from comfyanonymous:master #86


Merged: 4 commits merged into master from comfyanonymous:master on May 29, 2025.

9 changes: 8 additions & 1 deletion comfy/ldm/wan/model.py
@@ -539,13 +539,20 @@ def block_wrap(args):
        x = self.unpatchify(x, grid_sizes)
        return x

-    def forward(self, x, timestep, context, clip_fea=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)

        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])

+        if time_dim_concat is not None:
+            time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
+            x = torch.cat([x, time_dim_concat], dim=2)
+            t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0])
+
        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
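The new time_dim_concat path appends extra latent frames (e.g. reference-subject latents) along the temporal axis before patchification, then recomputes the temporal token count t_len from the concatenated length. A minimal standalone sketch of that arithmetic (not part of this diff; pad_to_patch_size is re-implemented here and a (1, 2, 2) patch size is assumed):

import torch
import torch.nn.functional as F

def pad_to_patch_size(x, patch_size):
    # Right-pad t/h/w so each is divisible by its patch dimension
    # (F.pad expects pads for the last dimension first).
    pad = []
    for dim, p in zip(x.shape[-3:], patch_size):
        pad = [0, (p - dim % p) % p] + pad
    return F.pad(x, pad)

patch_size = (1, 2, 2)
x = torch.zeros(1, 16, 21, 60, 104)    # (bs, c, t, h, w) video latent
ref = torch.zeros(1, 16, 1, 60, 104)   # reference-subject latent, 1 frame

ref = pad_to_patch_size(ref, patch_size)
x = torch.cat([x, ref], dim=2)         # append along the time axis (dim=2)
t_len = (x.shape[2] + patch_size[0] // 2) // patch_size[0]
print(x.shape, t_len)                  # torch.Size([1, 16, 22, 60, 104]) 22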
5 changes: 3 additions & 2 deletions comfy/lora.py
@@ -283,8 +283,9 @@ def model_lora_keys_unet(model, key_map={}):
        for k in sdk:
            if k.startswith("diffusion_model."):
                if k.endswith(".weight"):
-                    key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
-                    key_map["lycoris_{}".format(key_lora)] = k #SimpleTuner lycoris format
+                    key_lora = k[len("diffusion_model."):-len(".weight")]
+                    key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format
+                    key_map["transformer.{}".format(key_lora)] = k #SimpleTuner regular format

    if isinstance(model, comfy.model_base.ACEStep):
        for k in sdk:
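Both key formats are now derived from the same un-mangled key_lora: the lycoris name replaces dots with underscores, while the regular SimpleTuner name keeps them. A quick hypothetical example of the two LoRA key names this produces (the checkpoint key below is made up for illustration):

k = "diffusion_model.blocks.0.self_attn.q.weight"
key_lora = k[len("diffusion_model."):-len(".weight")]  # "blocks.0.self_attn.q"
print("lycoris_" + key_lora.replace(".", "_"))  # lycoris_blocks_0_self_attn_q (lycoris format)
print("transformer." + key_lora)                # transformer.blocks.0.self_attn.q (regular format)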
5 changes: 5 additions & 0 deletions comfy/model_base.py
@@ -1057,6 +1057,11 @@ def extra_conds(self, **kwargs):
        clip_vision_output = kwargs.get("clip_vision_output", None)
        if clip_vision_output is not None:
            out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states)

+        time_dim_concat = kwargs.get("time_dim_concat", None)
+        if time_dim_concat is not None:
+            out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat))
+
        return out
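The pattern here is the usual extra_conds one: an optional conditioning value is forwarded only when present, after being mapped into the model's latent space. A generic, self-contained sketch of that gating (CONDRegularStub and process_latent_in below are stand-ins for illustration, not ComfyUI APIs):

class CONDRegularStub:
    def __init__(self, cond):
        self.cond = cond

def process_latent_in(latent):
    # Placeholder latent-format scaling.
    return [v * 0.5 for v in latent]

def extra_conds(**kwargs):
    out = {}
    time_dim_concat = kwargs.get("time_dim_concat", None)
    if time_dim_concat is not None:
        out["time_dim_concat"] = CONDRegularStub(process_latent_in(time_dim_concat))
    return out

print(extra_conds(time_dim_concat=[1.0, 2.0])["time_dim_concat"].cond)  # [0.5, 1.0]
print(extra_conds())                                                    # {}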
45 changes: 24 additions & 21 deletions comfy_api_nodes/nodes_pika.py
@@ -6,40 +6,42 @@
from __future__ import annotations

import io
-from typing import Optional, TypeVar
import logging
-import torch
+from typing import Optional, TypeVar
+
import numpy as np
+import torch

+from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions
+from comfy_api.input_impl import VideoFromFile
+from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput
+from comfy_api_nodes.apinode_utils import (
+    download_url_to_video_output,
+    tensor_to_bytesio,
+)
from comfy_api_nodes.apis import (
-    PikaBodyGenerate22T2vGenerate22T2vPost,
-    PikaGenerateResponse,
-    PikaBodyGenerate22I2vGenerate22I2vPost,
-    PikaVideoResponse,
-    PikaBodyGenerate22C2vGenerate22PikascenesPost,
    IngredientsMode,
-    PikaDurationEnum,
-    PikaResolutionEnum,
-    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
+    PikaBodyGenerate22C2vGenerate22PikascenesPost,
+    PikaBodyGenerate22I2vGenerate22I2vPost,
+    PikaBodyGenerate22KeyframeGenerate22PikaframesPost,
+    PikaBodyGenerate22T2vGenerate22T2vPost,
    PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
+    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
    PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-    PikaBodyGenerate22KeyframeGenerate22PikaframesPost,
+    PikaDurationEnum,
    Pikaffect,
+    PikaGenerateResponse,
+    PikaResolutionEnum,
+    PikaVideoResponse,
)
from comfy_api_nodes.apis.client import (
    ApiEndpoint,
+    EmptyRequest,
    HttpMethod,
-    SynchronousOperation,
    PollingOperation,
-    EmptyRequest,
-)
-from comfy_api_nodes.apinode_utils import (
-    tensor_to_bytesio,
-    download_url_to_video_output,
+    SynchronousOperation,
)
from comfy_api_nodes.mapper_utils import model_field_to_node_input
-from comfy_api.input_impl.video_types import VideoInput, VideoContainer, VideoCodec
-from comfy_api.input_impl import VideoFromFile
-from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions

R = TypeVar("R")

@@ -204,6 +206,7 @@ def INPUT_TYPES(cls):
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
                "comfy_api_key": "API_KEY_COMFY_ORG",
+                "unique_id": "UNIQUE_ID",
            },
        }

@@ -457,7 +460,7 @@ def INPUT_TYPES(cls):
            },
        }

-    DESCRIPTION = "Add any object or image into your video. Upload a video and specify what youd like to add to create a seamlessly integrated result."
+    DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result."

    def api_call(
        self,
19 changes: 16 additions & 3 deletions comfy_api_nodes/nodes_veo2.py
@@ -54,6 +54,10 @@ class VeoVideoGenerationNode(ComfyNodeABC):
    """
    Generates videos from text prompts using Google's Veo API.

+    Supported models:
+    - veo-2.0-generate-001
+    - veo-3.0-generate-preview
+
    This node can create videos from text descriptions and optional image inputs,
    with control over parameters like aspect ratio, duration, and more.
    """
@@ -130,6 +134,14 @@ def INPUT_TYPES(s):
                    "default": None,
                    "tooltip": "Optional reference image to guide video generation",
                }),
+                "model": (
+                    IO.COMBO,
+                    {
+                        "options": ["veo-2.0-generate-001", "veo-3.0-generate-preview"],
+                        "default": "veo-2.0-generate-001",
+                        "tooltip": "Model to use for video generation. Defaults to veo 2.0",
+                    },
+                ),
            },
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
@@ -154,6 +166,7 @@ def generate_video(
        person_generation="ALLOW",
        seed=0,
        image=None,
+        model="veo-2.0-generate-001",
        unique_id: Optional[str] = None,
        **kwargs,
    ):
@@ -192,7 +205,7 @@
        # Initial request to start video generation
        initial_operation = SynchronousOperation(
            endpoint=ApiEndpoint(
-                path="/proxy/veo/generate",
+                path=f"/proxy/veo/{model}/generate",
                method=HttpMethod.POST,
                request_model=Veo2GenVidRequest,
                response_model=Veo2GenVidResponse
@@ -223,7 +236,7 @@ def progress_extractor(response):
        # Define the polling operation
        poll_operation = PollingOperation(
            poll_endpoint=ApiEndpoint(
-                path="/proxy/veo/poll",
+                path=f"/proxy/veo/{model}/poll",
                method=HttpMethod.POST,
                request_model=Veo2GenVidPollRequest,
                response_model=Veo2GenVidPollResponse
@@ -304,5 +317,5 @@ def progress_extractor(response):
}

NODE_DISPLAY_NAME_MAPPINGS = {
-    "VeoVideoGenerationNode": "Google Veo2 Video Generation",
+    "VeoVideoGenerationNode": "Google Veo Video Generation",
}
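With the model id threaded through, each supported model resolves to its own generate and poll proxy endpoints. A tiny sketch of the resulting paths (derived only from the f-strings in the diff above):

for model in ("veo-2.0-generate-001", "veo-3.0-generate-preview"):
    print(f"/proxy/veo/{model}/generate")
    print(f"/proxy/veo/{model}/poll")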
39 changes: 39 additions & 0 deletions comfy_extras/nodes_wan.py
@@ -345,6 +345,44 @@ def encode(self, positive, negative, vae, width, height, length, batch_size, sta
        out_latent["samples"] = latent
        return (positive, negative, out_latent)

+class WanPhantomSubjectToVideo:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"positive": ("CONDITIONING", ),
+                             "negative": ("CONDITIONING", ),
+                             "vae": ("VAE", ),
+                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                             },
+                "optional": {"images": ("IMAGE", ),
+                             }}
+
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "CONDITIONING", "LATENT")
+    RETURN_NAMES = ("positive", "negative_text", "negative_img_text", "latent")
+    FUNCTION = "encode"
+
+    CATEGORY = "conditioning/video_models"
+
+    def encode(self, positive, negative, vae, width, height, length, batch_size, images):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        cond2 = negative
+        if images is not None:
+            images = comfy.utils.common_upscale(images[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            latent_images = []
+            for i in images:
+                latent_images += [vae.encode(i.unsqueeze(0)[:, :, :, :3])]
+            concat_latent_image = torch.cat(latent_images, dim=2)
+
+            positive = node_helpers.conditioning_set_values(positive, {"time_dim_concat": concat_latent_image})
+            cond2 = node_helpers.conditioning_set_values(negative, {"time_dim_concat": concat_latent_image})
+            negative = node_helpers.conditioning_set_values(negative, {"time_dim_concat": comfy.latent_formats.Wan21().process_out(torch.zeros_like(concat_latent_image))})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return (positive, cond2, negative, out_latent)
+
NODE_CLASS_MAPPINGS = {
    "WanImageToVideo": WanImageToVideo,
    "WanFunControlToVideo": WanFunControlToVideo,
@@ -353,4 +391,5 @@ def encode(self, positive, negative, vae, width, height, length, batch_size, sta
    "WanVaceToVideo": WanVaceToVideo,
    "TrimVideoLatent": TrimVideoLatent,
    "WanCameraImageToVideo": WanCameraImageToVideo,
+    "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo,
}
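The encode() loop above turns each reference image into a one-frame latent and stacks them along the time axis; that stack is what reaches the model's time_dim_concat input via the conditioning. A minimal standalone sketch of the stacking (not ComfyUI code; encode_one is a hypothetical stand-in for vae.encode that returns a dummy latent of the right shape):

import torch

def encode_one(img):                    # img: (1, H, W, 3) in [0, 1]
    h, w = img.shape[1] // 8, img.shape[2] // 8
    return torch.zeros(1, 16, 1, h, w)  # (bs, c, t=1, H/8, W/8) dummy latent

images = torch.rand(2, 480, 832, 3)     # two reference-subject images
latents = [encode_one(i.unsqueeze(0)[:, :, :, :3]) for i in images]
concat_latent_image = torch.cat(latents, dim=2)  # stack along time (dim=2)
print(concat_latent_image.shape)        # torch.Size([1, 16, 2, 60, 104])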