Initial ACE-Step model implementation. (#7972)
comfy/text_encoders/ace.py | 145 (new file)
@@ -0,0 +1,145 @@
from comfy import sd1_clip
from .spiece_tokenizer import SPieceTokenizer
import comfy.text_encoders.t5
import os
import re
import torch
import logging

from tokenizers import Tokenizer
from .ace_text_cleaners import multilingual_cleaners

SUPPORT_LANGUAGES = {
    "en": 259, "de": 260, "fr": 262, "es": 284, "it": 285,
    "pt": 286, "pl": 294, "tr": 295, "ru": 267, "cs": 293,
    "nl": 297, "ar": 5022, "zh": 5023, "ja": 5412, "hu": 5753,
    "ko": 6152, "hi": 6680
}

structure_pattern = re.compile(r"\[.*?\]")

DEFAULT_VOCAB_FILE = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")


class VoiceBpeTokenizer:
    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt, lang):
        txt = multilingual_cleaners(txt, lang)
        return txt

    def encode(self, txt, lang='en'):
        # lang = lang.split("-")[0]  # remove the region
        # self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        lang = "zh-cn" if lang == "zh" else lang
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def get_lang(self, line):
        if line.startswith("[") and line[3:4] == ']':
            lang = line[1:3].lower()
            if lang in SUPPORT_LANGUAGES:
                return lang, line[4:]
        return "en", line

    def __call__(self, string):
        lines = string.split("\n")
        lyric_token_idx = [261]
        for line in lines:
            line = line.strip()
            if not line:
                lyric_token_idx += [2]
                continue

            lang, line = self.get_lang(line)

            if lang not in SUPPORT_LANGUAGES:
                lang = "en"
            if "zh" in lang:
                lang = "zh"
            if "spa" in lang:
                lang = "es"

            try:
                if structure_pattern.match(line):
                    token_idx = self.encode(line, "en")
                else:
                    token_idx = self.encode(line, lang)
                lyric_token_idx = lyric_token_idx + token_idx + [2]
            except Exception as e:
                logging.warning("tokenize error {} for line {} major_language {}".format(e, line, lang))
        return {"input_ids": lyric_token_idx}

    @staticmethod
    def from_pretrained(path, **kwargs):
        return VoiceBpeTokenizer(path, **kwargs)

    def get_vocab(self):
        return {}

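# Illustrative usage sketch, assuming the bundled vocab.json is present; token
# ids other than the start id 261 and the separator 2 are placeholders:
#
#   tok = VoiceBpeTokenizer()
#   tok("[en]hello world\n\n[de]hallo")
#
# Each tagged line is cleaned, re-prefixed with its language tag, and spaces are
# rewritten to "[SPACE]" before BPE encoding. The output starts with 261, and
# every line (including the blank one) appends a 2, giving
# {"input_ids": [261, <ids for "[en]hello world">, 2, 2, <ids for "[de]hallo">, 2]}.
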
class UMT5BaseModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "umt5_config_base.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=False, model_options=model_options)


class UMT5BaseTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer = tokenizer_data.get("spiece_model", None)
        super().__init__(tokenizer, pad_with_end=False, embedding_size=768, embedding_key='umt5base', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=0, tokenizer_data=tokenizer_data)

    def state_dict(self):
        return {"spiece_model": self.tokenizer.serialize_model()}


class LyricsTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "ace_lyrics_tokenizer"), "vocab.json")
        super().__init__(tokenizer, pad_with_end=False, embedding_size=1024, embedding_key='lyrics', tokenizer_class=VoiceBpeTokenizer, has_start_token=True, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=2, has_end_token=False, tokenizer_data=tokenizer_data)

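# Note the deliberate asymmetry between the two tokenizers above:
# UMT5BaseTokenizer pads with 0 and declares 768-wide embeddings (matching
# "d_model" in umt5_config_base.json), while LyricsTokenizer pads with 2, keeps
# a start token, and declares 1024-wide embeddings for the lyric stream.
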
class AceT5Tokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        self.voicebpe = LyricsTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        self.umt5base = UMT5BaseTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)

    def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
        out = {}
        out["lyrics"] = self.voicebpe.tokenize_with_weights(kwargs.get("lyrics", ""), return_word_ids, **kwargs)
        out["umt5base"] = self.umt5base.tokenize_with_weights(text, return_word_ids, **kwargs)
        return out

    def untokenize(self, token_weight_pair):
        return self.umt5base.untokenize(token_weight_pair)

    def state_dict(self):
        return self.umt5base.state_dict()


class AceT5Model(torch.nn.Module):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__()
        self.umt5base = UMT5BaseModel(device=device, dtype=dtype, model_options=model_options)
        self.dtypes = set()
        if dtype is not None:
            self.dtypes.add(dtype)

    def set_clip_options(self, options):
        self.umt5base.set_clip_options(options)

    def reset_clip_options(self):
        self.umt5base.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        token_weight_pairs_umt5base = token_weight_pairs["umt5base"]
        token_weight_pairs_lyrics = token_weight_pairs["lyrics"]

        t5_out, t5_pooled = self.umt5base.encode_token_weights(token_weight_pairs_umt5base)

        lyrics_embeds = torch.tensor(list(map(lambda a: a[0], token_weight_pairs_lyrics[0]))).unsqueeze(0)
        return t5_out, None, {"conditioning_lyrics": lyrics_embeds}

    def load_sd(self, sd):
        return self.umt5base.load_sd(sd)
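
The lyric ids are never embedded by the UMT5 encoder; `encode_token_weights` simply repackages them as a tensor for the diffusion model. A minimal sketch of that repackaging, with a made-up id 305 standing in for real lyric tokens (261 and 2 are the start and separator ids used above):

import torch

# one batch entry of (token_id, weight) pairs, as produced by the lyrics tokenizer
pairs = [(261, 1.0), (305, 1.0), (2, 1.0)]  # 305 is hypothetical
lyrics = torch.tensor(list(map(lambda a: a[0], pairs))).unsqueeze(0)
assert lyrics.shape == (1, 3)  # [batch, n_tokens], returned as "conditioning_lyrics"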
comfy/text_encoders/ace_lyrics_tokenizer/vocab.json | 15535 (new file)
(File diff suppressed because it is too large)
comfy/text_encoders/ace_text_cleaners.py | 270 (new file)
@@ -0,0 +1,270 @@
# basic text cleaners for the ACE step model
# I didn't copy the ones from the reference code because I didn't want to deal with the dependencies
# TODO: more languages than english?

import re


def number_to_text(num, ordinal=False):
    """
    Convert a number (int or float) to its text representation.

    Args:
        num: The number to convert
        ordinal: Accepted for API compatibility; currently unused

    Returns:
        str: Text representation of the number
    """

    if not isinstance(num, (int, float)):
        return "Input must be a number"

    # Handle special case of zero
    if num == 0:
        return "zero"

    # Handle negative numbers
    negative = num < 0
    num = abs(num)

    # Handle floats
    if isinstance(num, float):
        # Split into integer and decimal parts
        int_part = int(num)

        # Convert both parts
        int_text = _int_to_text(int_part)

        # Handle decimal part (convert to string and remove '0.')
        decimal_str = str(num).split('.')[1]
        decimal_text = " point " + " ".join(_digit_to_text(int(digit)) for digit in decimal_str)

        result = int_text + decimal_text
    else:
        # Handle integers
        result = _int_to_text(num)

    # Add 'negative' prefix for negative numbers
    if negative:
        result = "negative " + result

    return result


def _int_to_text(num):
    """Helper function to convert an integer to text"""

    ones = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
            "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
            "seventeen", "eighteen", "nineteen"]

    tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    if num < 20:
        return ones[num]

    if num < 100:
        return tens[num // 10] + (" " + ones[num % 10] if num % 10 != 0 else "")

    if num < 1000:
        return ones[num // 100] + " hundred" + (" " + _int_to_text(num % 100) if num % 100 != 0 else "")

    if num < 1000000:
        return _int_to_text(num // 1000) + " thousand" + (" " + _int_to_text(num % 1000) if num % 1000 != 0 else "")

    if num < 1000000000:
        return _int_to_text(num // 1000000) + " million" + (" " + _int_to_text(num % 1000000) if num % 1000000 != 0 else "")

    return _int_to_text(num // 1000000000) + " billion" + (" " + _int_to_text(num % 1000000000) if num % 1000000000 != 0 else "")


def _digit_to_text(digit):
    """Convert a single digit to text"""
    digits = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    return digits[digit]

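# Hedged spot-checks (added for illustration; each follows from tracing the
# functions above):
#   number_to_text(0)     -> "zero"
#   number_to_text(-42)   -> "negative forty two"
#   number_to_text(1234)  -> "one thousand two hundred thirty four"
#   number_to_text(3.14)  -> "three point one four"
# The `ordinal` flag is accepted but never used, so "third" is never produced.
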
_whitespace_re = re.compile(r"\s+")


# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = {
    "en": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mrs", "misess"),
            ("mr", "mister"),
            ("dr", "doctor"),
            ("st", "saint"),
            ("co", "company"),
            ("jr", "junior"),
            ("maj", "major"),
            ("gen", "general"),
            ("drs", "doctors"),
            ("rev", "reverend"),
            ("lt", "lieutenant"),
            ("hon", "honorable"),
            ("sgt", "sergeant"),
            ("capt", "captain"),
            ("esq", "esquire"),
            ("ltd", "limited"),
            ("col", "colonel"),
            ("ft", "fort"),
        ]
    ],
}


def expand_abbreviations_multilingual(text, lang="en"):
    for regex, replacement in _abbreviations[lang]:
        text = re.sub(regex, replacement, text)
    return text


_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "),
            ("@", " at "),
            ("%", " percent "),
            ("#", " hash "),
            ("$", " dollar "),
            ("£", " pound "),
            ("°", " degree "),
        ]
    ],
}


def expand_symbols_multilingual(text, lang="en"):
    for regex, replacement in _symbols_multilingual[lang]:
        text = re.sub(regex, replacement, text)
        text = text.replace("  ", " ")  # Ensure there are no double spaces
    return text.strip()


_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")  # dot escaped so it only matches "." thousands separators
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")


def _remove_commas(m):
    text = m.group(0)
    if "," in text:
        text = text.replace(",", "")
    return text


def _remove_dots(m):
    text = m.group(0)
    if "." in text:
        text = text.replace(".", "")
    return text

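# Hedged spot-checks for the tables and helpers above:
#   expand_abbreviations_multilingual("dr. smith", "en") -> "doctor smith"
#   expand_symbols_multilingual("tom & jerry", "en")     -> "tom and jerry"
#   _comma_number_re + _remove_commas rewrite "1,234" as "1234"
#   _dot_number_re + _remove_dots rewrite "1.234" as "1234" (non-English styles)
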
def _expand_decimal_point(m, lang="en"):
    amount = m.group(1).replace(",", ".")
    return number_to_text(float(amount))


def _expand_currency(m, lang="en", currency="USD"):
    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
    full_amount = number_to_text(amount)

    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    if amount.is_integer():
        last_and = full_amount.rfind(and_equivalents[lang])
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount


def _expand_ordinal(m, lang="en"):
    return number_to_text(int(m.group(1)), ordinal=True)


def _expand_number(m, lang="en"):
    return number_to_text(int(m.group(0)))


def expand_numbers_multilingual(text, lang="en"):
    if lang in ["en", "ru"]:
        text = re.sub(_comma_number_re, _remove_commas, text)
    else:
        text = re.sub(_dot_number_re, _remove_dots, text)
    try:
        text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
        text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
        text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
    except Exception:
        pass

    text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
    text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
    text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)


def multilingual_cleaners(text, lang):
    text = text.replace('"', "")
    if lang == "tr":
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")
    text = lowercase(text)
    try:
        text = expand_numbers_multilingual(text, lang)
    except Exception:
        pass
    try:
        text = expand_abbreviations_multilingual(text, lang)
    except Exception:
        pass
    try:
        text = expand_symbols_multilingual(text, lang=lang)
    except Exception:
        pass
    text = collapse_whitespace(text)
    return text


def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
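
A hedged end-to-end check of the pipeline above, assuming the ComfyUI repo root is on sys.path. The expected string follows from tracing the code: quotes are stripped, text is lowercased, "3rd" becomes "three" (the ordinal flag is unused), and "mr." expands to "mister":

from comfy.text_encoders.ace_text_cleaners import multilingual_cleaners

assert multilingual_cleaners('The 3rd "Mr. Brown"', "en") == "the three mister brown"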
comfy/text_encoders/umt5_config_base.json | 22 (new file)
@@ -0,0 +1,22 @@
{
    "d_ff": 2048,
    "d_kv": 64,
    "d_model": 768,
    "decoder_start_token_id": 0,
    "dropout_rate": 0.1,
    "eos_token_id": 1,
    "dense_act_fn": "gelu_pytorch_tanh",
    "initializer_factor": 1.0,
    "is_encoder_decoder": true,
    "is_gated_act": true,
    "layer_norm_epsilon": 1e-06,
    "model_type": "umt5",
    "num_decoder_layers": 12,
    "num_heads": 12,
    "num_layers": 12,
    "output_past": true,
    "pad_token_id": 0,
    "relative_attention_num_buckets": 32,
    "tie_word_embeddings": false,
    "vocab_size": 256384
}
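
Two consistency checks this config satisfies, as a hedged sketch (the relative path is an assumption about the working directory): the 768 here is the embedding_size that UMT5BaseTokenizer declares, and for this particular config the attention width d_kv * num_heads reassembles d_model (T5-family configs do not require that in general).

import json

with open("comfy/text_encoders/umt5_config_base.json") as f:  # assumed relative path
    cfg = json.load(f)

assert cfg["d_model"] == 768  # matches embedding_size=768 in UMT5BaseTokenizer
assert cfg["d_kv"] * cfg["num_heads"] == cfg["d_model"]  # 64 * 12 == 768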