Basic Hunyuan Video model support.

2024-12-16 19:35:40 -05:00
parent 19ee5d9d8b
commit bda1482a27
18 changed files with 413646 additions and 76 deletions
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@@ -0,0 +1,98 @@
+from comfy import sd1_clip
+import comfy.model_management
+import comfy.text_encoders.llama
+from transformers import LlamaTokenizerFast
+import torch
+import os
+
+
+class LLAMA3Tokenizer(sd1_clip.SDTokenizer):
+    """SDTokenizer set up for the tokenizer files bundled next to this module.
+
+    The tokenizer is loaded with ``LlamaTokenizerFast`` from the
+    ``llama_tokenizer`` directory located beside this file. Padding to a
+    maximum length is switched off and ``max_length`` is set to a very large
+    value; ``min_length`` is forwarded unchanged to the base class.
+    """
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}, min_length=256):
+        # tokenizer_data is accepted but not read by this constructor.
+        module_dir = os.path.dirname(os.path.realpath(__file__))
+        super().__init__(
+            os.path.join(module_dir, "llama_tokenizer"),
+            embedding_directory=embedding_directory,
+            tokenizer_class=LlamaTokenizerFast,
+            embedding_key='llama',
+            embedding_size=4096,
+            has_start_token=True,
+            has_end_token=True,
+            end_token=128009,
+            pad_token=128258,
+            pad_with_end=False,
+            pad_to_max_length=False,
+            min_length=min_length,
+            max_length=99999999,
+        )
+
+class LLAMAModel(sd1_clip.SDClipModel):
+    """SDClipModel wrapper that uses ``comfy.text_encoders.llama.Llama2`` as its model class.
+
+    Defaults request the hidden state at ``layer_idx=-3`` and pass
+    ``layer_norm_hidden_state=False``. The ``attention_mask`` flag controls
+    both whether attention masks are enabled and whether they are returned.
+    """
+
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}):
+        # A llama-specific fp8 entry, when present, is re-exposed under the
+        # generic "scaled_fp8" key on a new dict so the caller's dict is not mutated.
+        fp8_setting = model_options.get("llama_scaled_fp8", None)
+        if fp8_setting is not None:
+            model_options = {**model_options, "scaled_fp8": fp8_setting}
+
+        super().__init__(
+            device=device,
+            layer=layer,
+            layer_idx=layer_idx,
+            textmodel_json_config={},
+            dtype=dtype,
+            special_tokens={"start": 128000, "pad": 128258},
+            layer_norm_hidden_state=False,
+            model_class=comfy.text_encoders.llama.Llama2,
+            enable_attention_masks=attention_mask,
+            return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+
+class HunyuanVideoTokenizer:
+    """Tokenize one prompt for both text encoders used by Hunyuan Video.
+
+    The raw prompt goes to the CLIP-L tokenizer; the Llama tokenizer receives
+    the prompt prefixed with a fixed system/user chat template.
+    """
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        # The CLIP-L tokenizer class may be overridden through tokenizer_data.
+        clip_l_cls = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
+        self.clip_l = clip_l_cls(embedding_directory=embedding_directory)
+        # Fixed prefix placed in front of every prompt before Llama tokenization.
+        self.llama_template = """<|start_header_id|>system<|end_header_id|>
+
+Describe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>"""  # 93 tokens
+        self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
+
+    def tokenize_with_weights(self, text: str, return_word_ids=False):
+        """Return ``{"l": ..., "llama": ...}`` holding each tokenizer's output for *text*."""
+        clip_tokens = self.clip_l.tokenize_with_weights(text, return_word_ids)
+        templated_prompt = f"{self.llama_template}{text}"
+        llama_tokens = self.llama.tokenize_with_weights(templated_prompt, return_word_ids)
+        return {"l": clip_tokens, "llama": llama_tokens}
+
+    def untokenize(self, token_weight_pair):
+        """Delegate untokenization to the CLIP-L tokenizer."""
+        return self.clip_l.untokenize(token_weight_pair)
+
+    def state_dict(self):
+        """Return an empty dict; this tokenizer exposes no state."""
+        return {}
+
+
+class HunyuanVideoClipModel(torch.nn.Module):
+    """Combined CLIP-L + Llama text encoder for Hunyuan Video.
+
+    The Llama output, with the leading prompt-template positions cut off, is
+    returned as the conditioning sequence; CLIP-L contributes only its pooled
+    output.
+    """
+
+    def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
+        super().__init__()
+        dtype_llama = comfy.model_management.pick_weight_dtype(dtype_llama, dtype, device)
+        # The CLIP-L implementation may be overridden through model_options.
+        clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
+        self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
+        self.llama = LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options)
+        self.dtypes = {dtype, dtype_llama}
+
+    def set_clip_options(self, options):
+        """Forward *options* to both sub-encoders."""
+        self.clip_l.set_clip_options(options)
+        self.llama.set_clip_options(options)
+
+    def reset_clip_options(self):
+        """Reset the clip options of both sub-encoders."""
+        self.clip_l.reset_clip_options()
+        self.llama.reset_clip_options()
+
+    def encode_token_weights(self, token_weight_pairs):
+        """Encode the tokenizer output for both encoders.
+
+        Args:
+            token_weight_pairs: dict with keys ``"l"`` (CLIP-L tokens) and
+                ``"llama"`` (Llama tokens).
+
+        Returns:
+            ``(llama_out, l_pooled, llama_extra_out)``: the Llama output sliced
+            from the last ``<|end_header_id|>`` position onward, the CLIP-L
+            pooled output, and the Llama extras. ``"attention_mask"`` is
+            removed from the extras when nothing in the sliced mask is masked.
+        """
+        token_weight_pairs_l = token_weight_pairs["l"]
+        token_weight_pairs_llama = token_weight_pairs["llama"]
+
+        llama_out, _, llama_extra_out = self.llama.encode_token_weights(token_weight_pairs_llama)
+
+        # Find the last <|end_header_id|> in the first token row; everything
+        # before it belongs to the prompt template.
+        template_end = 0
+        for i, v in enumerate(token_weight_pairs_llama[0]):
+            token = v[0]
+            # An entry may hold a tensor instead of an integer id (presumably an
+            # embedding vector inserted by the tokenizer - verify against
+            # sd1_clip). Comparing a multi-element tensor inside `if` raises, and
+            # such an entry can never be the header token, so skip it.
+            if torch.is_tensor(token):
+                continue
+            if token == 128007:  # <|end_header_id|>
+                template_end = i
+
+        llama_out = llama_out[:, template_end:]
+        llama_extra_out["attention_mask"] = llama_extra_out["attention_mask"][:, template_end:]
+        if llama_extra_out["attention_mask"].sum() == torch.numel(llama_extra_out["attention_mask"]):
+            llama_extra_out.pop("attention_mask")  # attention mask is useless if no masked elements
+
+        # Only the pooled CLIP-L output is used; its sequence output is discarded.
+        _, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
+        return llama_out, l_pooled, llama_extra_out
+
+    def load_sd(self, sd):
+        """Route a state dict to CLIP-L or Llama based on a CLIP-specific key."""
+        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
+            return self.clip_l.load_sd(sd)
+        else:
+            return self.llama.load_sd(sd)
+
+
+def hunyuan_video_clip(dtype_llama=None, llama_scaled_fp8=None):
+    """Return a ``HunyuanVideoClipModel`` subclass with the given Llama settings baked in.
+
+    ``dtype_llama`` is always forwarded to the base constructor.
+    ``llama_scaled_fp8`` is added to ``model_options`` only when it is not
+    ``None`` and the caller has not already supplied that key.
+    """
+    class HunyuanVideoClipModel_(HunyuanVideoClipModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            needs_fp8_entry = llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options
+            if needs_fp8_entry:
+                # Build a new dict so the caller's options are left untouched.
+                model_options = {**model_options, "llama_scaled_fp8": llama_scaled_fp8}
+            super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
+
+    return HunyuanVideoClipModel_
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -0,0 +1,221 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from dataclasses import dataclass
+from typing import Optional, Any
+
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.model_management
+import comfy.ldm.common_dit
+
+import comfy.model_management
+
+@dataclass
+class Llama2Config:
+    """Hyper-parameters consumed by the Llama modules in this file.
+
+    NOTE(review): the defaults presumably match the Llama checkpoint used by
+    Hunyuan Video - confirm against the model card.
+    """
+    # Number of rows in the token embedding table.
+    vocab_size: int = 128320
+    # Width of the residual stream / embedding dimension.
+    hidden_size: int = 4096
+    # Inner width of the gated MLP.
+    intermediate_size: int = 14336
+    # Number of stacked transformer blocks.
+    num_hidden_layers: int = 32
+    # Number of query heads; head_dim is hidden_size // num_attention_heads.
+    num_attention_heads: int = 32
+    # Number of key/value heads; they are repeated to match the query heads.
+    num_key_value_heads: int = 8
+    # Not referenced by the code visible in this file.
+    max_position_embeddings: int = 8192
+    # Epsilon handed to every RMSNorm.
+    rms_norm_eps: float = 1e-5
+    # Base used when computing the rotary embedding frequencies.
+    rope_theta: float = 500000.0
+
+class RMSNorm(nn.Module):
+    """Normalization layer with a learnable per-feature weight.
+
+    The weight is allocated with ``torch.empty`` and is therefore
+    uninitialized until real values are loaded into it.
+    """
+
+    def __init__(self, dim: int, eps: float = 1e-5, device=None, dtype=None):
+        super().__init__()
+        self.eps = eps
+        uninitialized = torch.empty(dim, device=device, dtype=dtype)
+        self.weight = nn.Parameter(uninitialized)
+
+    def forward(self, x: torch.Tensor):
+        """Apply ``comfy.ldm.common_dit.rms_norm`` using this layer's weight and eps."""
+        normed = comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
+        return normed
+
+
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the second half.
+
+    For a last dimension ``[a, b]`` (split at ``size // 2``) the result is
+    ``[-b, a]``.
+    """
+    split_at = x.shape[-1] // 2
+    front, back = x[..., :split_at], x[..., split_at:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def precompute_freqs_cis(head_dim, seq_len, theta, device=None):
+    """Build the rotary-embedding cosine and sine tables.
+
+    Returns a ``(cos, sin)`` tuple; each tensor has shape
+    ``(1, seq_len, head_dim)`` when ``head_dim`` is even, with the
+    per-position angles repeated across both halves of the last dimension.
+    """
+    exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
+    inv_freq = 1.0 / (theta ** exponents)
+
+    positions = torch.arange(0, seq_len, device=device).float()
+
+    # (1, head_dim/2, 1) @ (1, 1, seq_len) yields the angle for every
+    # (frequency, position) pair; transpose so positions come first.
+    angles = (inv_freq[None, :, None] @ positions[None, None, :]).transpose(1, 2)
+    table = torch.cat((angles, angles), dim=-1)
+    return (table.cos(), table.sin())
+
+
+def apply_rope(xq, xk, freqs_cis):
+    """Apply rotary position embedding to the query and key tensors.
+
+    ``freqs_cis`` is indexed as ``(cos, sin)``; both tables get an extra
+    dimension inserted at position 1 before being broadcast against
+    ``xq`` and ``xk``. Returns the rotated ``(xq, xk)`` pair.
+    """
+    cos_table = freqs_cis[0].unsqueeze(1)
+    sin_table = freqs_cis[1].unsqueeze(1)
+
+    def _rotate(t):
+        return (t * cos_table) + (rotate_half(t) * sin_table)
+
+    return _rotate(xq), _rotate(xk)
+
+
+class Attention(nn.Module):
+    """Self-attention with rotary embeddings and grouped key/value heads.
+
+    Queries use ``num_attention_heads`` heads while keys and values use
+    ``num_key_value_heads`` heads, which are repeated to match the query heads
+    before attention is computed. All projections are bias-free.
+    """
+    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.hidden_size = config.hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+
+        # Fall back to torch.nn layers when no custom ops namespace is given.
+        ops = ops or nn
+        self.q_proj = ops.Linear(config.hidden_size, config.hidden_size, bias=False, device=device, dtype=dtype)
+        self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False, device=device, dtype=dtype)
+        self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False, device=device, dtype=dtype)
+        self.o_proj = ops.Linear(config.hidden_size, config.hidden_size, bias=False, device=device, dtype=dtype)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        freqs_cis: Optional[torch.Tensor] = None,
+    ):
+        """Attend over ``hidden_states`` (a 3-D tensor: batch, sequence, features).
+
+        ``attention_mask`` is handed to ``optimized_attention`` unchanged;
+        ``freqs_cis`` is the ``(cos, sin)`` pair consumed by ``apply_rope``.
+        Returns the output of ``o_proj`` applied to the attention result.
+        """
+        batch_size, seq_length, _ = hidden_states.shape
+
+        xq = self.q_proj(hidden_states)
+        xk = self.k_proj(hidden_states)
+        xv = self.v_proj(hidden_states)
+
+        # Split the feature dimension into heads: (batch, heads, seq, head_dim).
+        xq = xq.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
+
+        xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis)
+
+        # Repeat each key/value head so the head count matches the queries.
+        xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+        xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+
+        # NOTE(review): skip_reshape=True presumably tells optimized_attention the
+        # inputs are already split into heads and that it returns
+        # (batch, seq, hidden) - confirm against comfy.ldm.modules.attention.
+        output = optimized_attention(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True)
+        return self.o_proj(output)
+
+class MLP(nn.Module):
+    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.
+
+    All three projections are bias-free linear layers taken from ``ops``, or
+    from ``torch.nn`` when ``ops`` is not provided.
+    """
+
+    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        ops = ops or nn
+        hidden, inner = config.hidden_size, config.intermediate_size
+        self.gate_proj = ops.Linear(hidden, inner, bias=False, device=device, dtype=dtype)
+        self.up_proj = ops.Linear(hidden, inner, bias=False, device=device, dtype=dtype)
+        self.down_proj = ops.Linear(inner, hidden, bias=False, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Return the gated projection of ``x`` mapped back to the hidden size."""
+        gate = F.silu(self.gate_proj(x))
+        return self.down_proj(gate * self.up_proj(x))
+
+class TransformerBlock(nn.Module):
+    """One decoder layer: normalized self-attention then normalized MLP, each with a residual add."""
+
+    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        placement = dict(device=device, dtype=dtype)
+        self.self_attn = Attention(config, ops=ops, **placement)
+        self.mlp = MLP(config, ops=ops, **placement)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, **placement)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, **placement)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        freqs_cis: Optional[torch.Tensor] = None,
+    ):
+        """Run the block on ``x`` and return the updated hidden states."""
+        # Attention sub-layer: normalize, attend, add back to the input.
+        attn_out = self.self_attn(
+            hidden_states=self.input_layernorm(x),
+            attention_mask=attention_mask,
+            freqs_cis=freqs_cis,
+        )
+        x = x + attn_out
+
+        # Feed-forward sub-layer: normalize, transform, add back.
+        return x + self.mlp(self.post_attention_layernorm(x))
+
+class Llama2_(nn.Module):
+    """Decoder stack: token embedding, ``num_hidden_layers`` blocks and a final norm.
+
+    There is no language-model head; ``forward`` returns hidden states only.
+    """
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        # NOTE: ops is dereferenced unconditionally here, so despite the None
+        # default a real ops namespace must be supplied.
+        self.embed_tokens = ops.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            device=device,
+            dtype=dtype
+        )
+        self.layers = nn.ModuleList([
+            TransformerBlock(config, device=device, dtype=dtype, ops=ops)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+        # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
+
+    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
+        """Run the decoder on token ids ``x``.
+
+        Args:
+            x: token ids fed to the embedding layer.
+            attention_mask: optional mask; entries equal to 1 are kept, all
+                others are blocked. Assumed to be (batch, seq) - TODO confirm.
+            intermediate_output: index of the layer whose output is also
+                returned; negative values count from the last layer.
+            final_layer_norm_intermediate: when true, the final norm is applied
+                to the intermediate output as well.
+            dtype: passed to the embedding layer as ``out_dtype``.
+
+        Returns:
+            ``(x, intermediate)``: the normalized final hidden states and the
+            selected intermediate hidden states (``None`` if none was selected).
+        """
+        # The embedding layer must accept an out_dtype keyword.
+        x = self.embed_tokens(x, out_dtype=dtype)
+
+        # Rotary tables for the current sequence length, rebuilt on every call.
+        freqs_cis = precompute_freqs_cis(self.config.hidden_size // self.config.num_attention_heads,
+                                         x.shape[1],
+                                         self.config.rope_theta,
+                                         device=x.device)
+
+        mask = None
+        if attention_mask is not None:
+            # Invert the mask (1 -> 0, 0 -> 1), expand it to (batch, 1, seq, seq),
+            # then turn every non-zero entry into -inf to get an additive mask.
+            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
+            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
+
+        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere.
+        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
+        if mask is not None:
+            mask += causal_mask
+        else:
+            mask = causal_mask
+
+        intermediate = None
+        if intermediate_output is not None:
+            # Translate a negative index into an absolute layer index.
+            if intermediate_output < 0:
+                intermediate_output = len(self.layers) + intermediate_output
+
+        for i, layer in enumerate(self.layers):
+            x = layer(
+                x=x,
+                attention_mask=mask,
+                freqs_cis=freqs_cis,
+            )
+            if i == intermediate_output:
+                # Snapshot this layer's output before later layers replace x.
+                intermediate = x.clone()
+
+        x = self.norm(x)
+        if intermediate is not None and final_layer_norm_intermediate:
+            intermediate = self.norm(intermediate)
+
+        return x, intermediate
+
+
+class Llama2(torch.nn.Module):
+    """Top-level wrapper that builds ``Llama2_`` from a plain config dict.
+
+    ``config_dict`` entries override the ``Llama2Config`` defaults; an empty
+    dict selects the defaults.
+    """
+
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        cfg = Llama2Config(**config_dict)
+        self.num_layers = cfg.num_hidden_layers
+        self.dtype = dtype
+
+        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
+
+    def get_input_embeddings(self):
+        """Return the inner model's token embedding layer."""
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, embeddings):
+        """Replace the inner model's token embedding layer with *embeddings*."""
+        self.model.embed_tokens = embeddings
+
+    def forward(self, input_ids, *args, **kwargs):
+        """Forward every argument to the inner model and return its result."""
+        return self.model(input_ids, *args, **kwargs)
--- a/comfy/text_encoders/llama_tokenizer/tokenizer.json
+++ b/comfy/text_encoders/llama_tokenizer/tokenizer.json
--- a/comfy/text_encoders/llama_tokenizer/tokenizer_config.json
+++ b/comfy/text_encoders/llama_tokenizer/tokenizer_config.json