[pull] master from comfyanonymous:master #86


Merged: 4 commits merged into master from comfyanonymous:master on May 29, 2025.

9 changes: 8 additions & 1 deletion comfy/ldm/wan/model.py
@@ -539,13 +539,20 @@ def block_wrap(args):
        x = self.unpatchify(x, grid_sizes)
        return x

-    def forward(self, x, timestep, context, clip_fea=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs):
        bs, c, t, h, w = x.shape
        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)

        patch_size = self.patch_size
        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])

+        if time_dim_concat is not None:
+            time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
+            x = torch.cat([x, time_dim_concat], dim=2)
+            t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0])
+
        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
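The new time_dim_concat path appends extra latent frames (e.g. reference-subject latents) along the temporal axis before patchification, then recomputes the temporal token count t_len from the concatenated length. A minimal standalone sketch of that arithmetic (not part of this diff; pad_to_patch_size is re-implemented here and a (1, 2, 2) patch size is assumed):

import torch
import torch.nn.functional as F

def pad_to_patch_size(x, patch_size):
    # Right-pad t/h/w so each is divisible by its patch dimension
    # (F.pad expects pads for the last dimension first).
    pad = []
    for dim, p in zip(x.shape[-3:], patch_size):
        pad = [0, (p - dim % p) % p] + pad
    return F.pad(x, pad)

patch_size = (1, 2, 2)
x = torch.zeros(1, 16, 21, 60, 104)    # (bs, c, t, h, w) video latent
ref = torch.zeros(1, 16, 1, 60, 104)   # reference-subject latent, 1 frame

ref = pad_to_patch_size(ref, patch_size)
x = torch.cat([x, ref], dim=2)         # append along the time axis (dim=2)
t_len = (x.shape[2] + patch_size[0] // 2) // patch_size[0]
print(x.shape, t_len)                  # torch.Size([1, 16, 22, 60, 104]) 22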
5 changes: 3 additions & 2 deletions comfy/lora.py
@@ -283,8 +283,9 @@ def model_lora_keys_unet(model, key_map={}):
        for k in sdk:
            if k.startswith("diffusion_model."):
                if k.endswith(".weight"):
-                    key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
-                    key_map["lycoris_{}".format(key_lora)] = k #SimpleTuner lycoris format
+                    key_lora = k[len("diffusion_model."):-len(".weight")]
+                    key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format
+                    key_map["transformer.{}".format(key_lora)] = k #SimpleTuner regular format

    if isinstance(model, comfy.model_base.ACEStep):
        for k in sdk:
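Both key formats are now derived from the same un-mangled key_lora: the lycoris name replaces dots with underscores, while the regular SimpleTuner name keeps them. A quick hypothetical example of the two LoRA key names this produces (the checkpoint key below is made up for illustration):

k = "diffusion_model.blocks.0.self_attn.q.weight"
key_lora = k[len("diffusion_model."):-len(".weight")]  # "blocks.0.self_attn.q"
print("lycoris_" + key_lora.replace(".", "_"))  # lycoris_blocks_0_self_attn_q (lycoris format)
print("transformer." + key_lora)                # transformer.blocks.0.self_attn.q (regular format)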
5 changes: 5 additions & 0 deletions comfy/model_base.py
@@ -1057,6 +1057,11 @@ def extra_conds(self, **kwargs):
        clip_vision_output = kwargs.get("clip_vision_output", None)
        if clip_vision_output is not None:
            out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states)

+        time_dim_concat = kwargs.get("time_dim_concat", None)
+        if time_dim_concat is not None:
+            out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat))
+
        return out
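The pattern here is the usual extra_conds one: an optional conditioning value is forwarded only when present, after being mapped into the model's latent space. A generic, self-contained sketch of that gating (CONDRegularStub and process_latent_in below are stand-ins for illustration, not ComfyUI APIs):

class CONDRegularStub:
    def __init__(self, cond):
        self.cond = cond

def process_latent_in(latent):
    # Placeholder latent-format scaling.
    return [v * 0.5 for v in latent]

def extra_conds(**kwargs):
    out = {}
    time_dim_concat = kwargs.get("time_dim_concat", None)
    if time_dim_concat is not None:
        out["time_dim_concat"] = CONDRegularStub(process_latent_in(time_dim_concat))
    return out

print(extra_conds(time_dim_concat=[1.0, 2.0])["time_dim_concat"].cond)  # [0.5, 1.0]
print(extra_conds())                                                    # {}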
45 changes: 24 additions & 21 deletions comfy_api_nodes/nodes_pika.py
@@ -6,40 +6,42 @@
from __future__ import annotations

import io
-from typing import Optional, TypeVar
import logging
-import torch
+from typing import Optional, TypeVar
+
import numpy as np
+import torch

+from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions
+from comfy_api.input_impl import VideoFromFile
+from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput
+from comfy_api_nodes.apinode_utils import (
+    download_url_to_video_output,
+    tensor_to_bytesio,
+)
from comfy_api_nodes.apis import (
-    PikaBodyGenerate22T2vGenerate22T2vPost,
-    PikaGenerateResponse,
-    PikaBodyGenerate22I2vGenerate22I2vPost,
-    PikaVideoResponse,
-    PikaBodyGenerate22C2vGenerate22PikascenesPost,
    IngredientsMode,
-    PikaDurationEnum,
-    PikaResolutionEnum,
-    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
+    PikaBodyGenerate22C2vGenerate22PikascenesPost,
+    PikaBodyGenerate22I2vGenerate22I2vPost,
+    PikaBodyGenerate22KeyframeGenerate22PikaframesPost,
+    PikaBodyGenerate22T2vGenerate22T2vPost,
    PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
+    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
    PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-    PikaBodyGenerate22KeyframeGenerate22PikaframesPost,
+    PikaDurationEnum,
    Pikaffect,
+    PikaGenerateResponse,
+    PikaResolutionEnum,
+    PikaVideoResponse,
)
from comfy_api_nodes.apis.client import (
    ApiEndpoint,
+    EmptyRequest,
    HttpMethod,
-    SynchronousOperation,
    PollingOperation,
-    EmptyRequest,
-)
-from comfy_api_nodes.apinode_utils import (
-    tensor_to_bytesio,
-    download_url_to_video_output,
+    SynchronousOperation,
)
from comfy_api_nodes.mapper_utils import model_field_to_node_input
-from comfy_api.input_impl.video_types import VideoInput, VideoContainer, VideoCodec
-from comfy_api.input_impl import VideoFromFile
-from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions

R = TypeVar("R")

@@ -204,6 +206,7 @@ def INPUT_TYPES(cls):
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
                "comfy_api_key": "API_KEY_COMFY_ORG",
+                "unique_id": "UNIQUE_ID",
            },
        }

@@ -457,7 +460,7 @@ def INPUT_TYPES(cls):
            },
        }

-    DESCRIPTION = "Add any object or image into your video. Upload a video and specify what youd like to add to create a seamlessly integrated result."
+    DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result."

    def api_call(
        self,
19 changes: 16 additions & 3 deletions comfy_api_nodes/nodes_veo2.py
@@ -54,6 +54,10 @@ class VeoVideoGenerationNode(ComfyNodeABC):
    """
    Generates videos from text prompts using Google's Veo API.

+    Supported models:
+    - veo-2.0-generate-001
+    - veo-3.0-generate-preview
+
    This node can create videos from text descriptions and optional image inputs,
    with control over parameters like aspect ratio, duration, and more.
    """
@@ -130,6 +134,14 @@ def INPUT_TYPES(s):
                    "default": None,
                    "tooltip": "Optional reference image to guide video generation",
                }),
+                "model": (
+                    IO.COMBO,
+                    {
+                        "options": ["veo-2.0-generate-001", "veo-3.0-generate-preview"],
+                        "default": "veo-2.0-generate-001",
+                        "tooltip": "Model to use for video generation. Defaults to veo 2.0",
+                    },
+                ),
            },
            "hidden": {
                "auth_token": "AUTH_TOKEN_COMFY_ORG",
@@ -154,6 +166,7 @@ def generate_video(
        person_generation="ALLOW",
        seed=0,
        image=None,
+        model="veo-2.0-generate-001",
        unique_id: Optional[str] = None,
        **kwargs,
    ):
@@ -192,7 +205,7 @@
        # Initial request to start video generation
        initial_operation = SynchronousOperation(
            endpoint=ApiEndpoint(
-                path="/proxy/veo/generate",
+                path=f"/proxy/veo/{model}/generate",
                method=HttpMethod.POST,
                request_model=Veo2GenVidRequest,
                response_model=Veo2GenVidResponse
@@ -223,7 +236,7 @@ def progress_extractor(response):
        # Define the polling operation
        poll_operation = PollingOperation(
            poll_endpoint=ApiEndpoint(
-                path="/proxy/veo/poll",
+                path=f"/proxy/veo/{model}/poll",
                method=HttpMethod.POST,
                request_model=Veo2GenVidPollRequest,
                response_model=Veo2GenVidPollResponse
@@ -304,5 +317,5 @@ def progress_extractor(response):
}

NODE_DISPLAY_NAME_MAPPINGS = {
-    "VeoVideoGenerationNode": "Google Veo2 Video Generation",
+    "VeoVideoGenerationNode": "Google Veo Video Generation",
}
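With the model id threaded through, each supported model resolves to its own generate and poll proxy endpoints. A tiny sketch of the resulting paths (derived only from the f-strings in the diff above):

for model in ("veo-2.0-generate-001", "veo-3.0-generate-preview"):
    print(f"/proxy/veo/{model}/generate")
    print(f"/proxy/veo/{model}/poll")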
39 changes: 39 additions & 0 deletions comfy_extras/nodes_wan.py
@@ -345,6 +345,44 @@ def encode(self, positive, negative, vae, width, height, length, batch_size, sta
        out_latent["samples"] = latent
        return (positive, negative, out_latent)

+class WanPhantomSubjectToVideo:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"positive": ("CONDITIONING", ),
+                             "negative": ("CONDITIONING", ),
+                             "vae": ("VAE", ),
+                             "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+                             "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                             },
+                "optional": {"images": ("IMAGE", ),
+                             }}
+
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "CONDITIONING", "LATENT")
+    RETURN_NAMES = ("positive", "negative_text", "negative_img_text", "latent")
+    FUNCTION = "encode"
+
+    CATEGORY = "conditioning/video_models"
+
+    def encode(self, positive, negative, vae, width, height, length, batch_size, images):
+        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        cond2 = negative
+        if images is not None:
+            images = comfy.utils.common_upscale(images[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            latent_images = []
+            for i in images:
+                latent_images += [vae.encode(i.unsqueeze(0)[:, :, :, :3])]
+            concat_latent_image = torch.cat(latent_images, dim=2)
+
+            positive = node_helpers.conditioning_set_values(positive, {"time_dim_concat": concat_latent_image})
+            cond2 = node_helpers.conditioning_set_values(negative, {"time_dim_concat": concat_latent_image})
+            negative = node_helpers.conditioning_set_values(negative, {"time_dim_concat": comfy.latent_formats.Wan21().process_out(torch.zeros_like(concat_latent_image))})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return (positive, cond2, negative, out_latent)
+
NODE_CLASS_MAPPINGS = {
    "WanImageToVideo": WanImageToVideo,
    "WanFunControlToVideo": WanFunControlToVideo,
@@ -353,4 +391,5 @@ def encode(self, positive, negative, vae, width, height, length, batch_size, sta
    "WanVaceToVideo": WanVaceToVideo,
    "TrimVideoLatent": TrimVideoLatent,
    "WanCameraImageToVideo": WanCameraImageToVideo,
+    "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo,
}
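The encode() loop above turns each reference image into a one-frame latent and stacks them along the time axis; that stack is what reaches the model's time_dim_concat input via the conditioning. A minimal standalone sketch of the stacking (not ComfyUI code; encode_one is a hypothetical stand-in for vae.encode that returns a dummy latent of the right shape):

import torch

def encode_one(img):                    # img: (1, H, W, 3) in [0, 1]
    h, w = img.shape[1] // 8, img.shape[2] // 8
    return torch.zeros(1, 16, 1, h, w)  # (bs, c, t=1, H/8, W/8) dummy latent

images = torch.rand(2, 480, 832, 3)     # two reference-subject images
latents = [encode_one(i.unsqueeze(0)[:, :, :, :3]) for i in images]
concat_latent_image = torch.cat(latents, dim=2)  # stack along time (dim=2)
print(concat_latent_image.shape)        # torch.Size([1, 16, 2, 60, 104])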