Pad the audio embed inside the Humo model itself for more flexibility. (#9935)

2025-09-18 16:54:16 -07:00
parent 1ea8c54064
commit 24b0fce099
2 changed files with 3 additions and 4 deletions
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1551,6 +1551,9 @@ class HumoWanModel(WanModel):
        context_img_len = None

        if audio_embed is not None:
+            if reference_latent is not None:
+                zero_audio_pad = torch.zeros(audio_embed.shape[0], reference_latent.shape[-3], *audio_embed.shape[2:], device=audio_embed.device, dtype=audio_embed.dtype)
+                audio_embed = torch.cat([audio_embed, zero_audio_pad], dim=1)
            audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2)
        else:
            audio = None