HunyuanVideo 1.5 (#10819)

* init * update * Update model.py * Update model.py * remove print * Fix text encoding * Prevent empty negative prompt Really doesn't work otherwise * fp16 works * I2V * Update model_base.py * Update nodes_hunyuan.py * Better latent rgb factors * Use the correct sigclip output... * Support HunyuanVideo1.5 SR model * whitespaces... * Proper latent channel count * SR model fixes This also still needs timesteps scheduling based on the noise scale, can be used with two samplers too already * vae_refiner: roll the convolution through temporal Work in progress. Roll the convolution through time using 2-latent-frame chunks and a FIFO queue for the convolution seams. * Support HunyuanVideo15 latent resampler * fix * Some cleanup Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> * Proper hyvid15 I2V channels Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> * Fix TokenRefiner for fp16 Otherwise x.sum has infs, just in case only casting if input is fp16, I don't know if necessary. * Bugfix for the HunyuanVideo15 SR model * vae_refiner: roll the convolution through temporal II Roll the convolution through time using 2-latent-frame chunks and a FIFO queue for the convolution seams. Added support for encoder, lowered to 1 latent frame to save more VRAM, made work for Hunyuan Image 3.0 (as code shared). Fixed names, cleaned up code. * Allow any number of input frames in VAE. * Better VAE encode mem estimation. * Lowvram fix. * Fix hunyuan image 2.1 refiner. * Fix mistake. * Name changes. * Rename. * Whitespace. * Fix. * Fix. --------- Co-authored-by: kijai <40791699+kijai@users.noreply.github.com> Co-authored-by: Rattus <rattus128@gmail.com>
2025-11-20 19:44:43 -08:00
parent 10e90a5757
commit 943b3b615d
15 changed files with 777 additions and 126 deletions
@@ -6,7 +6,6 @@ import comfy.ldm.flux.layers
 import comfy.ldm.modules.diffusionmodules.mmdit
 from comfy.ldm.modules.attention import optimized_attention

-
 from dataclasses import dataclass
 from einops import repeat

@@ -42,6 +41,8 @@ class HunyuanVideoParams:
    guidance_embed: bool
    byt5: bool
    meanflow: bool
+    use_cond_type_embedding: bool
+    vision_in_dim: int


 class SelfAttentionRef(nn.Module):
@@ -157,7 +158,10 @@ class TokenRefiner(nn.Module):
        t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
        # m = mask.float().unsqueeze(-1)
        # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
-        c = x.sum(dim=1) / x.shape[1]
+        if x.dtype == torch.float16:
+            c = x.float().sum(dim=1) / x.shape[1]
+        else:
+            c = x.sum(dim=1) / x.shape[1]

        c = t + self.c_embedder(c.to(x.dtype))
        x = self.input_embedder(x)
@@ -196,11 +200,15 @@ class HunyuanVideo(nn.Module):
    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
        params = HunyuanVideoParams(**kwargs)
        self.params = params
        self.patch_size = params.patch_size
        self.in_channels = params.in_channels
        self.out_channels = params.out_channels
+        self.use_cond_type_embedding = params.use_cond_type_embedding
+        self.vision_in_dim = params.vision_in_dim
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@@ -266,6 +274,18 @@ class HunyuanVideo(nn.Module):
        if final_layer:
            self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)

+        # HunyuanVideo 1.5 specific modules
+        if self.vision_in_dim is not None:
+            from comfy.ldm.wan.model import MLPProj
+            self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
+        else:
+            self.vision_in = None
+        if self.use_cond_type_embedding:
+            # 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
+            self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
+        else:
+            self.cond_type_embedding = None
+
    def forward_orig(
        self,
        img: Tensor,
@@ -276,6 +296,7 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor = None,
        txt_byt5=None,
+        clip_fea=None,
        guidance: Tensor = None,
        guiding_frame_index=None,
        ref_latent=None,
@@ -331,12 +352,31 @@ class HunyuanVideo(nn.Module):

        txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)

+        if self.cond_type_embedding is not None:
+            self.cond_type_embedding.to(txt.device)
+            cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
+            txt = txt + cond_emb.to(txt.dtype)
+
        if self.byt5_in is not None and txt_byt5 is not None:
            txt_byt5 = self.byt5_in(txt_byt5)
+            if self.cond_type_embedding is not None:
+                cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
+                txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
+                txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
+            else:
+                txt = torch.cat((txt, txt_byt5), dim=1)
            txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
-            txt = torch.cat((txt, txt_byt5), dim=1)
            txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)

+        if clip_fea is not None:
+            txt_vision_states = self.vision_in(clip_fea)
+            if self.cond_type_embedding is not None:
+                cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
+                txt_vision_states = txt_vision_states + cond_emb
+            txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
+            extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
+            txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
+
        ids = torch.cat((img_ids, txt_ids), dim=1)
        pe = self.pe_embedder(ids)

@@ -430,14 +470,14 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        return repeat(img_ids, "h w c -> b (h w) c", b=bs)

-    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
+        ).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)

-    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
        bs = x.shape[0]
        if len(self.patch_size) == 3:
            img_ids = self.img_ids(x)
@@ -445,5 +485,5 @@ class HunyuanVideo(nn.Module):
        else:
            img_ids = self.img_ids_2d(x)
            txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
        return out
@@ -0,0 +1,120 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d
+import model_management, model_patcher
+
+class SRResidualCausalBlock3D(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        self.block = nn.Sequential(
+            VideoConv3d(channels, channels, kernel_size=3),
+            nn.SiLU(inplace=True),
+            VideoConv3d(channels, channels, kernel_size=3),
+            nn.SiLU(inplace=True),
+            VideoConv3d(channels, channels, kernel_size=3),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.block(x)
+
+class SRModel3DV2(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        hidden_channels: int = 64,
+        num_blocks: int = 6,
+        global_residual: bool = False,
+    ):
+        super().__init__()
+        self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
+        self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)])
+        self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
+        self.global_residual = bool(global_residual)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        y = self.in_conv(x)
+        for blk in self.blocks:
+            y = blk(y)
+        y = self.out_conv(y)
+        if self.global_residual and (y.shape == residual.shape):
+            y = y + residual
+        return y
+
+
+class Upsampler(nn.Module):
+    def __init__(
+        self,
+        z_channels: int,
+        out_channels: int,
+        block_out_channels: tuple[int, ...],
+        num_res_blocks: int = 2,
+    ):
+        super().__init__()
+        self.num_res_blocks = num_res_blocks
+        self.block_out_channels = block_out_channels
+        self.z_channels = z_channels
+
+        ch = block_out_channels[0]
+        self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
+
+        self.up = nn.ModuleList()
+
+        for i, tgt in enumerate(block_out_channels):
+            stage = nn.Module()
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                    out_channels=tgt,
+                                                    temb_channels=0,
+                                                    conv_shortcut=False,
+                                                    conv_op=VideoConv3d, norm_op=RMS_norm)
+                                        for j in range(num_res_blocks + 1)])
+            ch = tgt
+            self.up.append(stage)
+
+        self.norm_out = RMS_norm(ch)
+        self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
+
+    def forward(self, z):
+        """
+        Args:
+            z: (B, C, T, H, W)
+            target_shape: (H, W)
+        """
+        # z to block_in
+        repeats = self.block_out_channels[0] // (self.z_channels)
+        x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
+
+        # upsampling
+        for stage in self.up:
+            for blk in stage.block:
+                x = blk(x)
+
+        out = self.conv_out(F.silu(self.norm_out(x)))
+        return out
+
+UPSAMPLERS = {
+    "720p": SRModel3DV2,
+    "1080p": Upsampler,
+}
+
+class HunyuanVideo15SRModel():
+    def __init__(self, model_type, config):
+        self.load_device = model_management.vae_device()
+        offload_device = model_management.vae_offload_device()
+        self.dtype = model_management.vae_dtype(self.load_device)
+        self.model_class = UPSAMPLERS.get(model_type)
+        self.model = self.model_class(**config).eval()
+
+        self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+
+    def load_sd(self, sd):
+        return self.model.load_state_dict(sd, strict=True)
+
+    def get_sd(self):
+        return self.model.state_dict()
+
+    def resample_latent(self, latent):
+        model_management.load_model_gpu(self.patcher)
+        return self.model(latent.to(self.load_device))
@@ -4,8 +4,40 @@ import torch.nn.functional as F
 from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
 import comfy.ops
 import comfy.ldm.models.autoencoder
+import comfy.model_management
 ops = comfy.ops.disable_weight_init

+class NoPadConv3d(nn.Module):
+    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
+        super().__init__()
+        self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
+
+    x = xl[0]
+    xl.clear()
+
+    if conv_carry_out is not None:
+        to_push = x[:, :, -2:, :, :].clone()
+        conv_carry_out.append(to_push)
+
+    if isinstance(op, NoPadConv3d):
+        if conv_carry_in is None:
+            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate')
+        else:
+            carry_len = conv_carry_in[0].shape[2]
+            x = torch.cat([conv_carry_in.pop(0), x], dim=2)
+            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate')
+
+    out = op(x)
+
+    return out
+
+
 class RMS_norm(nn.Module):
    def __init__(self, dim):
        super().__init__()
@@ -14,7 +46,7 @@ class RMS_norm(nn.Module):
        self.gamma = nn.Parameter(torch.empty(shape))

    def forward(self, x):
-        return F.normalize(x, dim=1) * self.scale * self.gamma
+        return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)

 class DnSmpl(nn.Module):
    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
@@ -27,11 +59,12 @@ class DnSmpl(nn.Module):
        self.tds = tds
        self.gs = fct * ic // oc

-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
        r1 = 2 if self.tds else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
+
+        if self.tds and self.refiner_vae and conv_carry_in is None:

-        if self.tds and self.refiner_vae:
            hf = h[:, :, :1, :, :]
            b, c, f, ht, wd = hf.shape
            hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
@@ -39,14 +72,7 @@ class DnSmpl(nn.Module):
            hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
            hf = torch.cat([hf, hf], dim=1)

-            hn = h[:, :, 1:, :, :]
-            b, c, frms, ht, wd = hn.shape
-            nf = frms // r1
-            hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-            hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
-
-            h = torch.cat([hf, hn], dim=2)
+            h = h[:, :, 1:, :, :]

            xf = x[:, :, :1, :, :]
            b, ci, f, ht, wd = xf.shape
@@ -54,34 +80,32 @@ class DnSmpl(nn.Module):
            xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
            xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
            B, C, T, H, W = xf.shape
-            xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
+            xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)

-            xn = x[:, :, 1:, :, :]
-            b, ci, frms, ht, wd = xn.shape
-            nf = frms // r1
-            xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
-            xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
-            B, C, T, H, W = xn.shape
-            xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
-            sc = torch.cat([xf, xn], dim=2)
-        else:
-            b, c, frms, ht, wd = h.shape
+            x = x[:, :, 1:, :, :]

-            nf = frms // r1
-            h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-            h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
+        if h.shape[2] == 0:
+            return hf + xf

-            b, ci, frms, ht, wd = x.shape
-            nf = frms // r1
-            sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
-            sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
-            B, C, T, H, W = sc.shape
-            sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+        b, c, frms, ht, wd = h.shape
+        nf = frms // r1
+        h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
+        h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)

-        return h + sc
+        b, ci, frms, ht, wd = x.shape
+        nf = frms // r1
+        x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
+        x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
+        B, C, T, H, W = x.shape
+        x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+
+        if self.tds and self.refiner_vae and conv_carry_in is None:
+            h = torch.cat([hf, h], dim=2)
+            x = torch.cat([xf, x], dim=2)
+
+        return h + x


 class UpSmpl(nn.Module):
@@ -94,11 +118,11 @@ class UpSmpl(nn.Module):
        self.tus = tus
        self.rp = fct * oc // ic

-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
        r1 = 2 if self.tus else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)

-        if self.tus and self.refiner_vae:
+        if self.tus and self.refiner_vae and conv_carry_in is None:
            hf = h[:, :, :1, :, :]
            b, c, f, ht, wd = hf.shape
            nc = c // (2 * 2)
@@ -107,14 +131,7 @@ class UpSmpl(nn.Module):
            hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
            hf = hf[:, : hf.shape[1] // 2]

-            hn = h[:, :, 1:, :, :]
-            b, c, frms, ht, wd = hn.shape
-            nc = c // (r1 * 2 * 2)
-            hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-
-            h = torch.cat([hf, hn], dim=2)
+            h = h[:, :, 1:, :, :]

            xf = x[:, :, :1, :, :]
            b, ci, f, ht, wd = xf.shape
@@ -125,29 +142,43 @@ class UpSmpl(nn.Module):
            xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
            xf = xf.reshape(b, nc, f, ht * 2, wd * 2)

-            xn = x[:, :, 1:, :, :]
-            xn = xn.repeat_interleave(repeats=self.rp, dim=1)
-            b, c, frms, ht, wd = xn.shape
-            nc = c // (r1 * 2 * 2)
-            xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-            sc = torch.cat([xf, xn], dim=2)
-        else:
-            b, c, frms, ht, wd = h.shape
-            nc = c // (r1 * 2 * 2)
-            h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+            x = x[:, :, 1:, :, :]

-            sc = x.repeat_interleave(repeats=self.rp, dim=1)
-            b, c, frms, ht, wd = sc.shape
-            nc = c // (r1 * 2 * 2)
-            sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+        b, c, frms, ht, wd = h.shape
+        nc = c // (r1 * 2 * 2)
+        h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+        h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)

-        return h + sc
+        x = x.repeat_interleave(repeats=self.rp, dim=1)
+        b, c, frms, ht, wd = x.shape
+        nc = c // (r1 * 2 * 2)
+        x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+        x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+
+        if self.tus and self.refiner_vae and conv_carry_in is None:
+            h = torch.cat([hf, h], dim=2)
+            x = torch.cat([xf, x], dim=2)
+
+        return h + x
+
+class HunyuanRefinerResnetBlock(ResnetBlock):
+    def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm):
+        super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
+        h = x
+        h = [ self.swish(self.norm1(x)) ]
+        h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
+
+        h = [ self.dropout(self.swish(self.norm2(h))) ]
+        h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
+
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+
+        return x+h

 class Encoder(nn.Module):
    def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
@@ -160,7 +191,7 @@ class Encoder(nn.Module):

        self.refiner_vae = refiner_vae
        if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = NoPadConv3d
            norm_op = RMS_norm
        else:
            conv_op = ops.Conv3d
@@ -175,10 +206,9 @@ class Encoder(nn.Module):

        for i, tgt in enumerate(block_out_channels):
            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
-                                                     temb_channels=0,
-                                                     conv_op=conv_op, norm_op=norm_op)
+            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                                   out_channels=tgt,
+                                                                   conv_op=conv_op, norm_op=norm_op)
                                        for j in range(num_res_blocks)])
            ch = tgt
            if i < depth:
@@ -188,9 +218,9 @@ class Encoder(nn.Module):
            self.down.append(stage)

        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)

        self.norm_out = norm_op(ch)
        self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
@@ -201,31 +231,50 @@ class Encoder(nn.Module):
        if not self.refiner_vae and x.shape[2] == 1:
            x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)

-        x = self.conv_in(x)
+        if self.refiner_vae:
+            xl = [x[:, :, :1, :, :]]
+            if x.shape[2] > self.ffactor_temporal:
+                xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
+            x = xl
+        else:
+            x = [x]
+        out = []

-        for stage in self.down:
-            for blk in stage.block:
-                x = blk(x)
-            if hasattr(stage, 'downsample'):
-                x = stage.downsample(x)
+        conv_carry_in = None

-        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
+        for i, x1 in enumerate(x):
+            conv_carry_out = []
+            if i == len(x) - 1:
+                conv_carry_out = None
+            x1 = [ x1 ]
+            x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
+
+            for stage in self.down:
+                for blk in stage.block:
+                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                if hasattr(stage, 'downsample'):
+                    x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
+
+            out.append(x1)
+            conv_carry_in = conv_carry_out
+
+        if len(out) > 1:
+            out = torch.cat(out, dim=2)
+        else:
+            out = out[0]
+
+        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
+        del out

        b, c, t, h, w = x.shape
        grp = c // (self.z_channels << 1)
        skip = x.view(b, c // grp, grp, t, h, w).mean(2)

-        out = self.conv_out(F.silu(self.norm_out(x))) + skip
+        out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip

        if self.refiner_vae:
            out = self.regul(out)[0]

-            out = torch.cat((out[:, :, :1], out), dim=2)
-            out = out.permute(0, 2, 1, 3, 4)
-            b, f_times_2, c, h, w = out.shape
-            out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
-            out = out.permute(0, 2, 1, 3, 4).contiguous()
-
        return out

 class Decoder(nn.Module):
@@ -239,7 +288,7 @@ class Decoder(nn.Module):

        self.refiner_vae = refiner_vae
        if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = NoPadConv3d
            norm_op = RMS_norm
        else:
            conv_op = ops.Conv3d
@@ -249,9 +298,9 @@ class Decoder(nn.Module):
        self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)

        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)

        self.up = nn.ModuleList()
        depth = (ffactor_spatial >> 1).bit_length()
@@ -259,10 +308,9 @@ class Decoder(nn.Module):

        for i, tgt in enumerate(block_out_channels):
            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
-                                                     temb_channels=0,
-                                                     conv_op=conv_op, norm_op=norm_op)
+            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                                   out_channels=tgt,
+                                                                   conv_op=conv_op, norm_op=norm_op)
                                        for j in range(num_res_blocks + 1)])
            ch = tgt
            if i < depth:
@@ -275,27 +323,41 @@ class Decoder(nn.Module):
        self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)

    def forward(self, z):
-        if self.refiner_vae:
-            z = z.permute(0, 2, 1, 3, 4)
-            b, f, c, h, w = z.shape
-            z = z.reshape(b, f, 2, c // 2, h, w)
-            z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
-            z = z.permute(0, 2, 1, 3, 4)
-            z = z[:, :, 1:]
-
-        x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
+        x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))

-        for stage in self.up:
-            for blk in stage.block:
-                x = blk(x)
-            if hasattr(stage, 'upsample'):
-                x = stage.upsample(x)
+        if self.refiner_vae:
+            x = torch.split(x, 2, dim=2)
+        else:
+            x = [ x ]
+        out = []

-        out = self.conv_out(F.silu(self.norm_out(x)))
+        conv_carry_in = None
+
+        for i, x1 in enumerate(x):
+            conv_carry_out = []
+            if i == len(x) - 1:
+                conv_carry_out = None
+            for stage in self.up:
+                for blk in stage.block:
+                    x1 = blk(x1, conv_carry_in, conv_carry_out)
+                if hasattr(stage, 'upsample'):
+                    x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
+
+            x1 = [ F.silu(self.norm_out(x1)) ]
+            x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
+            out.append(x1)
+            conv_carry_in = conv_carry_out
+        del x
+
+        if len(out) > 1:
+            out = torch.cat(out, dim=2)
+        else:
+            out = out[0]

        if not self.refiner_vae:
            if z.shape[-3] == 1:
                out = out[:, :, -1:]

        return out
+