Release on 09.11.24

2024-11-09 17:19:42 +03:00
commit 3c64fe0c3e
339 changed files with 456101 additions and 0 deletions


@@ -0,0 +1,22 @@
from comfy import sd1_clip
from .spiece_tokenizer import SPieceTokenizer
import comfy.text_encoders.t5
import os

class PT5XlModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_config_xl.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 2, "pad": 1}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True, model_options=model_options)

class PT5XlTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model")
        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)

class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="pile_t5xl", tokenizer=PT5XlTokenizer)

class AuraT5Model(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__(device=device, dtype=dtype, model_options=model_options, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)

140 comfy/text_encoders/bert.py Normal file

@@ -0,0 +1,140 @@
import torch
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops

class BertAttention(torch.nn.Module):
    def __init__(self, embed_dim, heads, dtype, device, operations):
        super().__init__()
        self.heads = heads
        self.query = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
        self.key = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
        self.value = operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)

    def forward(self, x, mask=None, optimized_attention=None):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        out = optimized_attention(q, k, v, self.heads, mask)
        return out

class BertOutput(torch.nn.Module):
    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
        self.LayerNorm = operations.LayerNorm(output_dim, eps=layer_norm_eps, dtype=dtype, device=device)
        # self.dropout = nn.Dropout(0.0)

    def forward(self, x, y):
        x = self.dense(x)
        # hidden_states = self.dropout(hidden_states)
        x = self.LayerNorm(x + y)
        return x

class BertAttentionBlock(torch.nn.Module):
    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.self = BertAttention(embed_dim, heads, dtype, device, operations)
        self.output = BertOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)

    def forward(self, x, mask, optimized_attention):
        y = self.self(x, mask, optimized_attention)
        return self.output(y, x)

class BertIntermediate(torch.nn.Module):
    def __init__(self, embed_dim, intermediate_dim, dtype, device, operations):
        super().__init__()
        self.dense = operations.Linear(embed_dim, intermediate_dim, dtype=dtype, device=device)

    def forward(self, x):
        x = self.dense(x)
        return torch.nn.functional.gelu(x)

class BertBlock(torch.nn.Module):
    def __init__(self, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.attention = BertAttentionBlock(embed_dim, heads, layer_norm_eps, dtype, device, operations)
        self.intermediate = BertIntermediate(embed_dim, intermediate_dim, dtype, device, operations)
        self.output = BertOutput(intermediate_dim, embed_dim, layer_norm_eps, dtype, device, operations)

    def forward(self, x, mask, optimized_attention):
        x = self.attention(x, mask, optimized_attention)
        y = self.intermediate(x)
        return self.output(y, x)

class BertEncoder(torch.nn.Module):
    def __init__(self, num_layers, embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.layer = torch.nn.ModuleList([BertBlock(embed_dim, intermediate_dim, heads, layer_norm_eps, dtype, device, operations) for i in range(num_layers)])

    def forward(self, x, mask=None, intermediate_output=None):
        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

        if intermediate_output is not None:
            if intermediate_output < 0:
                intermediate_output = len(self.layer) + intermediate_output

        intermediate = None
        for i, l in enumerate(self.layer):
            x = l(x, mask, optimized_attention)
            if i == intermediate_output:
                intermediate = x.clone()
        return x, intermediate

class BertEmbeddings(torch.nn.Module):
    def __init__(self, vocab_size, max_position_embeddings, type_vocab_size, pad_token_id, embed_dim, layer_norm_eps, dtype, device, operations):
        super().__init__()
        self.word_embeddings = operations.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id, dtype=dtype, device=device)
        self.position_embeddings = operations.Embedding(max_position_embeddings, embed_dim, dtype=dtype, device=device)
        self.token_type_embeddings = operations.Embedding(type_vocab_size, embed_dim, dtype=dtype, device=device)
        self.LayerNorm = operations.LayerNorm(embed_dim, eps=layer_norm_eps, dtype=dtype, device=device)

    def forward(self, input_tokens, token_type_ids=None, dtype=None):
        x = self.word_embeddings(input_tokens, out_dtype=dtype)
        x += comfy.ops.cast_to_input(self.position_embeddings.weight[:x.shape[1]], x)
        if token_type_ids is not None:
            x += self.token_type_embeddings(token_type_ids, out_dtype=x.dtype)
        else:
            x += comfy.ops.cast_to_input(self.token_type_embeddings.weight[0], x)
        x = self.LayerNorm(x)
        return x

class BertModel_(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        embed_dim = config_dict["hidden_size"]
        layer_norm_eps = config_dict["layer_norm_eps"]

        self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations)
        self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations)

    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
        x = self.embeddings(input_tokens, dtype=dtype)
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

        x, i = self.encoder(x, mask, intermediate_output)
        return x, i

class BertModel(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.bert = BertModel_(config_dict, dtype, device, operations)
        self.num_layers = config_dict["num_hidden_layers"]

    def get_input_embeddings(self):
        return self.bert.embeddings.word_embeddings

    def set_input_embeddings(self, embeddings):
        self.bert.embeddings.word_embeddings = embeddings

    def forward(self, *args, **kwargs):
        return self.bert(*args, **kwargs)
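
For context, a small self-contained sketch (not part of the commit) of what the additive attention mask built in BertModel_.forward evaluates to for a padded batch; the same construction reappears in T5Stack.forward further down.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # 1 = real token, 0 = padding
mask = 1.0 - attention_mask.to(torch.float32).reshape(
    (attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
print(mask.shape)     # torch.Size([1, 1, 5, 5])
print(mask[0, 0, 0])  # tensor([0., 0., 0., -inf, -inf]) -- padded keys are masked out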


@@ -0,0 +1,72 @@
from comfy import sd1_clip
import comfy.text_encoders.t5
import comfy.text_encoders.sd3_clip
import comfy.model_management
from transformers import T5TokenizerFast
import torch
import os

class T5XXLTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)

class FluxTokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)

    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
        out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
        return out

    def untokenize(self, token_weight_pair):
        return self.clip_l.untokenize(token_weight_pair)

    def state_dict(self):
        return {}

class FluxClipModel(torch.nn.Module):
    def __init__(self, dtype_t5=None, device="cpu", dtype=None, model_options={}):
        super().__init__()
        dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
        clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
        self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
        self.t5xxl = comfy.text_encoders.sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
        self.dtypes = set([dtype, dtype_t5])

    def set_clip_options(self, options):
        self.clip_l.set_clip_options(options)
        self.t5xxl.set_clip_options(options)

    def reset_clip_options(self):
        self.clip_l.reset_clip_options()
        self.t5xxl.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        token_weight_pairs_l = token_weight_pairs["l"]
        token_weight_pairs_t5 = token_weight_pairs["t5xxl"]

        t5_out, t5_pooled = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
        l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
        return t5_out, l_pooled

    def load_sd(self, sd):
        if "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
            return self.clip_l.load_sd(sd)
        else:
            return self.t5xxl.load_sd(sd)

def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
    class FluxClipModel_(FluxClipModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
            super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
    return FluxClipModel_
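
A minimal usage sketch (not part of the commit), assuming ComfyUI is installed and that this file lands at comfy/text_encoders/flux.py; the module path is inferred from the surrounding imports, not shown in this diff. flux_clip() is a class factory: it returns a subclass with dtype_t5 and the optional scaled-fp8 tensor baked in, which ComfyUI instantiates later with the device/dtype it picked.

import torch
import comfy.text_encoders.flux as flux  # assumed module path

FluxTE = flux.flux_clip(dtype_t5=torch.float16)            # a class, not an instance
tokens = flux.FluxTokenizer().tokenize_with_weights("a photo of a cat")
print(tokens.keys())                                       # dict_keys(['l', 't5xxl'])
# ComfyUI later does: clip = FluxTE(device=..., dtype=...), then
# cond, pooled = clip.encode_token_weights(tokens)  -> (T5-XXL sequence, CLIP-L pooled)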


@@ -0,0 +1,38 @@
from comfy import sd1_clip
import comfy.text_encoders.sd3_clip
import os
from transformers import T5TokenizerFast

class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel):
    def __init__(self, **kwargs):
        kwargs["attention_mask"] = True
        super().__init__(**kwargs)

class MochiT5XXL(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options)

class T5XXLTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)

class MochiT5Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)

def mochi_te(dtype_t5=None, t5xxl_scaled_fp8=None):
    class MochiTEModel_(MochiT5XXL):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
            if dtype is None:
                dtype = dtype_t5
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return MochiTEModel_


@@ -0,0 +1,79 @@
from comfy import sd1_clip
from transformers import BertTokenizer
from .spiece_tokenizer import SPieceTokenizer
from .bert import BertModel
import comfy.text_encoders.t5
import os
import torch

class HyditBertModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)

class HyditBertTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hydit_clip_tokenizer")
        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77)

class MT5XLModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_config_xl.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)

class MT5XLTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        #tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "mt5_tokenizer"), "spiece.model")
        tokenizer = tokenizer_data.get("spiece_model", None)
        super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)

    def state_dict(self):
        return {"spiece_model": self.tokenizer.serialize_model()}

class HyditTokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None)
        self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
        self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)

    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["hydit_clip"] = self.hydit_clip.tokenize_with_weights(text, return_word_ids)
        out["mt5xl"] = self.mt5xl.tokenize_with_weights(text, return_word_ids)
        return out

    def untokenize(self, token_weight_pair):
        return self.hydit_clip.untokenize(token_weight_pair)

    def state_dict(self):
        return {"mt5xl.spiece_model": self.mt5xl.state_dict()["spiece_model"]}

class HyditModel(torch.nn.Module):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__()
        self.hydit_clip = HyditBertModel(dtype=dtype, model_options=model_options)
        self.mt5xl = MT5XLModel(dtype=dtype, model_options=model_options)

        self.dtypes = set()
        if dtype is not None:
            self.dtypes.add(dtype)

    def encode_token_weights(self, token_weight_pairs):
        hydit_out = self.hydit_clip.encode_token_weights(token_weight_pairs["hydit_clip"])
        mt5_out = self.mt5xl.encode_token_weights(token_weight_pairs["mt5xl"])
        return hydit_out[0], hydit_out[1], {"attention_mask": hydit_out[2]["attention_mask"], "conditioning_mt5xl": mt5_out[0], "attention_mask_mt5xl": mt5_out[2]["attention_mask"]}

    def load_sd(self, sd):
        if "bert.encoder.layer.0.attention.self.query.weight" in sd:
            return self.hydit_clip.load_sd(sd)
        else:
            return self.mt5xl.load_sd(sd)

    def set_clip_options(self, options):
        self.hydit_clip.set_clip_options(options)
        self.mt5xl.set_clip_options(options)

    def reset_clip_options(self):
        self.hydit_clip.reset_clip_options()
        self.mt5xl.reset_clip_options()


@@ -0,0 +1,35 @@
{
"_name_or_path": "hfl/chinese-roberta-wwm-ext-large",
"architectures": [
"BertModel"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"output_past": true,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.22.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 47020
}


@@ -0,0 +1,7 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}


@@ -0,0 +1,16 @@
{
"cls_token": "[CLS]",
"do_basic_tokenize": true,
"do_lower_case": true,
"mask_token": "[MASK]",
"name_or_path": "hfl/chinese-roberta-wwm-ext",
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/home/chenweifeng/.cache/huggingface/hub/models--hfl--chinese-roberta-wwm-ext/snapshots/5c58d0b8ec1d9014354d691c538661bf00bfdb44/special_tokens_map.json",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]",
"model_max_length": 77
}

File diff suppressed because it is too large


@@ -0,0 +1,25 @@
{
"_name_or_path": "openai/clip-vit-large-patch14",
"architectures": [
"CLIPTextModel"
],
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 248,
"model_type": "clip_text_model",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 1,
"projection_dim": 768,
"torch_dtype": "float32",
"transformers_version": "4.24.0",
"vocab_size": 49408
}


@@ -0,0 +1,30 @@
from comfy import sd1_clip
import os

class LongClipTokenizer_(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)

class LongClipModel_(sd1_clip.SDClipModel):
    def __init__(self, *args, **kwargs):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "long_clipl.json")
        super().__init__(*args, textmodel_json_config=textmodel_json_config, **kwargs)

class LongClipTokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)

class LongClipModel(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)

def model_options_long_clip(sd, tokenizer_data, model_options):
    w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
    if w is None:
        w = sd.get("text_model.embeddings.position_embedding.weight", None)
    if w is not None and w.shape[0] == 248:
        tokenizer_data = tokenizer_data.copy()
        model_options = model_options.copy()
        tokenizer_data["clip_l_tokenizer_class"] = LongClipTokenizer_
        model_options["clip_l_class"] = LongClipModel_
    return tokenizer_data, model_options
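
A small usage sketch (not part of the commit), with the module path comfy.text_encoders.long_clipl assumed from context: the helper only swaps in the 248-token LongCLIP classes when the checkpoint's position-embedding table actually has 248 rows, otherwise it returns the dicts untouched.

import torch
from comfy.text_encoders.long_clipl import model_options_long_clip  # assumed path

dummy_sd = {"text_model.embeddings.position_embedding.weight": torch.zeros(248, 768)}
tokenizer_data, model_options = model_options_long_clip(dummy_sd, {}, {})
print(tokenizer_data)  # {'clip_l_tokenizer_class': <class ...LongClipTokenizer_>}
print(model_options)   # {'clip_l_class': <class ...LongClipModel_>}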


@@ -0,0 +1,22 @@
{
"d_ff": 5120,
"d_kv": 64,
"d_model": 2048,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"dense_act_fn": "gelu_pytorch_tanh",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "mt5",
"num_decoder_layers": 24,
"num_heads": 32,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 250112
}


@@ -0,0 +1,22 @@
from comfy import sd1_clip
from transformers import T5TokenizerFast
import comfy.text_encoders.t5
import os

class T5BaseModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_base.json")
        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True)

class T5BaseTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128)

class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5base", tokenizer=T5BaseTokenizer)

class SAT5Model(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__(device=device, dtype=dtype, model_options=model_options, name="t5base", clip_model=T5BaseModel, **kwargs)


@@ -0,0 +1,23 @@
from comfy import sd1_clip
import os

class SD2ClipHModel(sd1_clip.SDClipModel):
    def __init__(self, arch="ViT-H-14", device="cpu", max_length=77, freeze=True, layer="penultimate", layer_idx=None, dtype=None, model_options={}):
        if layer == "penultimate":
            layer="hidden"
            layer_idx=-2

        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd2_clip_config.json")
        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, return_projected_pooled=True, model_options=model_options)

class SD2ClipHTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data={}):
        super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024)

class SD2Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="h", tokenizer=SD2ClipHTokenizer)

class SD2ClipModel(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="h", clip_model=SD2ClipHModel, **kwargs)


@@ -0,0 +1,23 @@
{
"architectures": [
"CLIPTextModel"
],
"attention_dropout": 0.0,
"bos_token_id": 0,
"dropout": 0.0,
"eos_token_id": 49407,
"hidden_act": "gelu",
"hidden_size": 1024,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 77,
"model_type": "clip_text_model",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 1,
"projection_dim": 1024,
"torch_dtype": "float32",
"vocab_size": 49408
}


@@ -0,0 +1,167 @@
from comfy import sd1_clip
from comfy import sdxl_clip
from transformers import T5TokenizerFast
import comfy.text_encoders.t5
import torch
import os
import comfy.model_management
import logging

class T5XXLModel(sd1_clip.SDClipModel):
    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}):
        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json")
        t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None)
        if t5xxl_scaled_fp8 is not None:
            model_options = model_options.copy()
            model_options["scaled_fp8"] = t5xxl_scaled_fp8

        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)

def t5_xxl_detect(state_dict, prefix=""):
    out = {}
    t5_key = "{}encoder.final_layer_norm.weight".format(prefix)
    if t5_key in state_dict:
        out["dtype_t5"] = state_dict[t5_key].dtype

    scaled_fp8_key = "{}scaled_fp8".format(prefix)
    if scaled_fp8_key in state_dict:
        out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype

    return out

class T5XXLTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)

class SD3Tokenizer:
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
        self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
        self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory)
        self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)

    def tokenize_with_weights(self, text:str, return_word_ids=False):
        out = {}
        out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
        out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
        out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids)
        return out

    def untokenize(self, token_weight_pair):
        return self.clip_g.untokenize(token_weight_pair)

    def state_dict(self):
        return {}

class SD3ClipModel(torch.nn.Module):
    def __init__(self, clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5_attention_mask=False, device="cpu", dtype=None, model_options={}):
        super().__init__()
        self.dtypes = set()
        if clip_l:
            clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
            self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False, model_options=model_options)
            self.dtypes.add(dtype)
        else:
            self.clip_l = None

        if clip_g:
            self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype, model_options=model_options)
            self.dtypes.add(dtype)
        else:
            self.clip_g = None

        if t5:
            dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
            self.t5_attention_mask = t5_attention_mask
            self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options, attention_mask=self.t5_attention_mask)
            self.dtypes.add(dtype_t5)
        else:
            self.t5xxl = None

        logging.debug("Created SD3 text encoder with: clip_l {}, clip_g {}, t5xxl {}:{}".format(clip_l, clip_g, t5, dtype_t5))

    def set_clip_options(self, options):
        if self.clip_l is not None:
            self.clip_l.set_clip_options(options)
        if self.clip_g is not None:
            self.clip_g.set_clip_options(options)
        if self.t5xxl is not None:
            self.t5xxl.set_clip_options(options)

    def reset_clip_options(self):
        if self.clip_l is not None:
            self.clip_l.reset_clip_options()
        if self.clip_g is not None:
            self.clip_g.reset_clip_options()
        if self.t5xxl is not None:
            self.t5xxl.reset_clip_options()

    def encode_token_weights(self, token_weight_pairs):
        token_weight_pairs_l = token_weight_pairs["l"]
        token_weight_pairs_g = token_weight_pairs["g"]
        token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
        lg_out = None
        pooled = None
        out = None
        extra = {}

        if len(token_weight_pairs_g) > 0 or len(token_weight_pairs_l) > 0:
            if self.clip_l is not None:
                lg_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
            else:
                l_pooled = torch.zeros((1, 768), device=comfy.model_management.intermediate_device())

            if self.clip_g is not None:
                g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
                if lg_out is not None:
                    cut_to = min(lg_out.shape[1], g_out.shape[1])
                    lg_out = torch.cat([lg_out[:,:cut_to], g_out[:,:cut_to]], dim=-1)
                else:
                    lg_out = torch.nn.functional.pad(g_out, (768, 0))
            else:
                g_out = None
                g_pooled = torch.zeros((1, 1280), device=comfy.model_management.intermediate_device())

            if lg_out is not None:
                lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1]))
                out = lg_out
            pooled = torch.cat((l_pooled, g_pooled), dim=-1)

        if self.t5xxl is not None:
            t5_output = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
            t5_out, t5_pooled = t5_output[:2]
            if self.t5_attention_mask:
                extra["attention_mask"] = t5_output[2]["attention_mask"]
            if lg_out is not None:
                out = torch.cat([lg_out, t5_out], dim=-2)
            else:
                out = t5_out

        if out is None:
            out = torch.zeros((1, 77, 4096), device=comfy.model_management.intermediate_device())

        if pooled is None:
            pooled = torch.zeros((1, 768 + 1280), device=comfy.model_management.intermediate_device())

        return out, pooled, extra

    def load_sd(self, sd):
        if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
            return self.clip_g.load_sd(sd)
        elif "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
            return self.clip_l.load_sd(sd)
        else:
            return self.t5xxl.load_sd(sd)

def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5xxl_scaled_fp8=None, t5_attention_mask=False):
    class SD3ClipModel_(SD3ClipModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
                model_options = model_options.copy()
                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
            super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, t5_attention_mask=t5_attention_mask, device=device, dtype=dtype, model_options=model_options)
    return SD3ClipModel_
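
A self-contained sketch (not part of the commit) of the shape arithmetic in SD3ClipModel.encode_token_weights: CLIP-L (768) and CLIP-G (1280) hidden states are concatenated channel-wise, zero-padded up to T5-XXL's 4096 channels, and the T5 sequence is then appended along the token axis; the pooled output is the 768+1280 concatenation. The random tensors below only stand in for encoder outputs.

import torch

l_out = torch.randn(1, 77, 768)     # stand-in for CLIP-L hidden states
g_out = torch.randn(1, 77, 1280)    # stand-in for CLIP-G hidden states
t5_out = torch.randn(1, 77, 4096)   # stand-in for T5-XXL hidden states

lg = torch.cat([l_out, g_out], dim=-1)                      # (1, 77, 2048)
lg = torch.nn.functional.pad(lg, (0, 4096 - lg.shape[-1]))  # (1, 77, 4096)
cond = torch.cat([lg, t5_out], dim=-2)                      # (1, 154, 4096)
pooled = torch.cat([torch.randn(1, 768), torch.randn(1, 1280)], dim=-1)  # (1, 2048)
print(cond.shape, pooled.shape)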


@@ -0,0 +1,32 @@
import os
import torch

class SPieceTokenizer:
    add_eos = True

    @staticmethod
    def from_pretrained(path):
        return SPieceTokenizer(path)

    def __init__(self, tokenizer_path):
        import sentencepiece
        if torch.is_tensor(tokenizer_path):
            tokenizer_path = tokenizer_path.numpy().tobytes()

        if isinstance(tokenizer_path, bytes):
            self.tokenizer = sentencepiece.SentencePieceProcessor(model_proto=tokenizer_path, add_eos=self.add_eos)
        else:
            self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path, add_eos=self.add_eos)

    def get_vocab(self):
        out = {}
        for i in range(self.tokenizer.get_piece_size()):
            out[self.tokenizer.id_to_piece(i)] = i
        return out

    def __call__(self, string):
        out = self.tokenizer.encode(string)
        return {"input_ids": out}

    def serialize_model(self):
        return torch.ByteTensor(list(self.tokenizer.serialized_model_proto()))
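
A self-contained sketch (not part of the commit) of the round trip serialize_model() and __init__ rely on: the sentencepiece model proto is carried through state dicts as a torch.ByteTensor and turned back into bytes before being handed to SentencePieceProcessor(model_proto=...). The byte string below is only a stand-in for a real .model proto.

import torch

proto_bytes = b"\x0a\x03abc"                      # stand-in for a real sentencepiece proto
as_tensor = torch.ByteTensor(list(proto_bytes))   # what serialize_model() returns
restored = as_tensor.numpy().tobytes()            # what __init__ does with a tensor input
assert restored == proto_bytes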

241 comfy/text_encoders/t5.py Normal file

@@ -0,0 +1,241 @@
import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops

class T5LayerNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device))
        self.variance_epsilon = eps

    def forward(self, x):
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        return comfy.ops.cast_to_input(self.weight, x) * x

activations = {
    "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
    "relu": torch.nn.functional.relu,
}

class T5DenseActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):
        super().__init__()
        self.wi = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        # self.dropout = nn.Dropout(config.dropout_rate)
        self.act = activations[ff_activation]

    def forward(self, x):
        x = self.act(self.wi(x))
        # x = self.dropout(x)
        x = self.wo(x)
        return x

class T5DenseGatedActDense(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, dtype, device, operations):
        super().__init__()
        self.wi_0 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wi_1 = operations.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
        self.wo = operations.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
        # self.dropout = nn.Dropout(config.dropout_rate)
        self.act = activations[ff_activation]

    def forward(self, x):
        hidden_gelu = self.act(self.wi_0(x))
        hidden_linear = self.wi_1(x)
        x = hidden_gelu * hidden_linear
        # x = self.dropout(x)
        x = self.wo(x)
        return x

class T5LayerFF(torch.nn.Module):
    def __init__(self, model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations):
        super().__init__()
        if gated_act:
            self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, ff_activation, dtype, device, operations)
        else:
            self.DenseReluDense = T5DenseActDense(model_dim, ff_dim, ff_activation, dtype, device, operations)

        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        forwarded_states = self.layer_norm(x)
        forwarded_states = self.DenseReluDense(forwarded_states)
        # x = x + self.dropout(forwarded_states)
        x += forwarded_states
        return x

class T5Attention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.k = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.v = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.o = operations.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.num_heads = num_heads

        self.relative_attention_bias = None
        if relative_attention_bias:
            self.relative_attention_num_buckets = 32
            self.relative_attention_max_distance = 128
            self.relative_attention_bias = operations.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device, dtype=dtype)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device, dtype):
        """Compute binned relative position bias"""
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=True,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        if self.relative_attention_bias is not None:
            past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device, x.dtype)

        if past_bias is not None:
            if mask is not None:
                mask = mask + past_bias
            else:
                mask = past_bias

        out = optimized_attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask)
        return self.o(out), past_bias

class T5LayerSelfAttention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations)
        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        normed_hidden_states = self.layer_norm(x)
        output, past_bias = self.SelfAttention(self.layer_norm(x), mask=mask, past_bias=past_bias, optimized_attention=optimized_attention)
        # x = x + self.dropout(attention_output)
        x += output
        return x, past_bias

class T5Block(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.layer = torch.nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device, operations))
        self.layer.append(T5LayerFF(model_dim, ff_dim, ff_activation, gated_act, dtype, device, operations))

    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        x, past_bias = self.layer[0](x, mask, past_bias, optimized_attention)
        x = self.layer[-1](x)
        return x, past_bias

class T5Stack(torch.nn.Module):
    def __init__(self, num_layers, model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention, dtype, device, operations):
        super().__init__()

        self.block = torch.nn.ModuleList(
            [T5Block(model_dim, inner_dim, ff_dim, ff_activation, gated_act, num_heads, relative_attention_bias=((not relative_attention) or (i == 0)), dtype=dtype, device=device, operations=operations) for i in range(num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations)
        # self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None):
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

        intermediate = None
        optimized_attention = optimized_attention_for_device(x.device, mask=attention_mask is not None, small_input=True)
        past_bias = None
        for i, l in enumerate(self.block):
            x, past_bias = l(x, mask, past_bias, optimized_attention)
            if i == intermediate_output:
                intermediate = x.clone()
        x = self.final_layer_norm(x)
        if intermediate is not None and final_layer_norm_intermediate:
            intermediate = self.final_layer_norm(intermediate)
        return x, intermediate

class T5(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.num_layers = config_dict["num_layers"]
        model_dim = config_dict["d_model"]

        self.encoder = T5Stack(self.num_layers, model_dim, model_dim, config_dict["d_ff"], config_dict["dense_act_fn"], config_dict["is_gated_act"], config_dict["num_heads"], config_dict["model_type"] != "umt5", dtype, device, operations)
        self.dtype = dtype
        self.shared = operations.Embedding(config_dict["vocab_size"], model_dim, device=device, dtype=dtype)

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, embeddings):
        self.shared = embeddings

    def forward(self, input_ids, *args, **kwargs):
        x = self.shared(input_ids, out_dtype=kwargs.get("dtype", torch.float32))
        if self.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
            x = torch.nan_to_num(x)  # Fix for fp8 T5 base
        return self.encoder(x, *args, **kwargs)
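
A small sketch (not part of the commit) that runs the bucketing described in the _relative_position_bucket docstring above on a toy sequence; it assumes ComfyUI's comfy package is importable.

import torch
from comfy.text_encoders.t5 import T5Attention

query_len = key_len = 6
context = torch.arange(query_len)[:, None]
memory = torch.arange(key_len)[None, :]
relative_position = memory - context  # memory_position - query_position, shape (6, 6)
buckets = T5Attention._relative_position_bucket(relative_position, bidirectional=True,
                                                num_buckets=32, max_distance=128)
print(buckets)  # bucket ids: earlier tokens fall in the low half, later tokens in the high half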


@@ -0,0 +1,22 @@
{
"d_ff": 3072,
"d_kv": 64,
"d_model": 768,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"dense_act_fn": "relu",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"num_decoder_layers": 12,
"num_heads": 12,
"num_layers": 12,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 32128
}


@@ -0,0 +1,22 @@
{
"d_ff": 10240,
"d_kv": 64,
"d_model": 4096,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 1,
"dense_act_fn": "gelu_pytorch_tanh",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "t5",
"num_decoder_layers": 24,
"num_heads": 64,
"num_layers": 24,
"output_past": true,
"pad_token_id": 0,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 32128
}


@@ -0,0 +1,22 @@
{
"d_ff": 5120,
"d_kv": 64,
"d_model": 2048,
"decoder_start_token_id": 0,
"dropout_rate": 0.1,
"eos_token_id": 2,
"dense_act_fn": "gelu_pytorch_tanh",
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": true,
"layer_norm_epsilon": 1e-06,
"model_type": "umt5",
"num_decoder_layers": 24,
"num_heads": 32,
"num_layers": 24,
"output_past": true,
"pad_token_id": 1,
"relative_attention_num_buckets": 32,
"tie_word_embeddings": false,
"vocab_size": 32128
}

Binary file not shown.


@@ -0,0 +1,125 @@
{
"additional_special_tokens": [
"<extra_id_0>",
"<extra_id_1>",
"<extra_id_2>",
"<extra_id_3>",
"<extra_id_4>",
"<extra_id_5>",
"<extra_id_6>",
"<extra_id_7>",
"<extra_id_8>",
"<extra_id_9>",
"<extra_id_10>",
"<extra_id_11>",
"<extra_id_12>",
"<extra_id_13>",
"<extra_id_14>",
"<extra_id_15>",
"<extra_id_16>",
"<extra_id_17>",
"<extra_id_18>",
"<extra_id_19>",
"<extra_id_20>",
"<extra_id_21>",
"<extra_id_22>",
"<extra_id_23>",
"<extra_id_24>",
"<extra_id_25>",
"<extra_id_26>",
"<extra_id_27>",
"<extra_id_28>",
"<extra_id_29>",
"<extra_id_30>",
"<extra_id_31>",
"<extra_id_32>",
"<extra_id_33>",
"<extra_id_34>",
"<extra_id_35>",
"<extra_id_36>",
"<extra_id_37>",
"<extra_id_38>",
"<extra_id_39>",
"<extra_id_40>",
"<extra_id_41>",
"<extra_id_42>",
"<extra_id_43>",
"<extra_id_44>",
"<extra_id_45>",
"<extra_id_46>",
"<extra_id_47>",
"<extra_id_48>",
"<extra_id_49>",
"<extra_id_50>",
"<extra_id_51>",
"<extra_id_52>",
"<extra_id_53>",
"<extra_id_54>",
"<extra_id_55>",
"<extra_id_56>",
"<extra_id_57>",
"<extra_id_58>",
"<extra_id_59>",
"<extra_id_60>",
"<extra_id_61>",
"<extra_id_62>",
"<extra_id_63>",
"<extra_id_64>",
"<extra_id_65>",
"<extra_id_66>",
"<extra_id_67>",
"<extra_id_68>",
"<extra_id_69>",
"<extra_id_70>",
"<extra_id_71>",
"<extra_id_72>",
"<extra_id_73>",
"<extra_id_74>",
"<extra_id_75>",
"<extra_id_76>",
"<extra_id_77>",
"<extra_id_78>",
"<extra_id_79>",
"<extra_id_80>",
"<extra_id_81>",
"<extra_id_82>",
"<extra_id_83>",
"<extra_id_84>",
"<extra_id_85>",
"<extra_id_86>",
"<extra_id_87>",
"<extra_id_88>",
"<extra_id_89>",
"<extra_id_90>",
"<extra_id_91>",
"<extra_id_92>",
"<extra_id_93>",
"<extra_id_94>",
"<extra_id_95>",
"<extra_id_96>",
"<extra_id_97>",
"<extra_id_98>",
"<extra_id_99>"
],
"eos_token": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"unk_token": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

File diff suppressed because one or more lines are too long


@@ -0,0 +1,939 @@
{
"added_tokens_decoder": {
"0": {
"content": "<pad>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"1": {
"content": "</s>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"2": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32000": {
"content": "<extra_id_99>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32001": {
"content": "<extra_id_98>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32002": {
"content": "<extra_id_97>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32003": {
"content": "<extra_id_96>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32004": {
"content": "<extra_id_95>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32005": {
"content": "<extra_id_94>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32006": {
"content": "<extra_id_93>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32007": {
"content": "<extra_id_92>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32008": {
"content": "<extra_id_91>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32009": {
"content": "<extra_id_90>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32010": {
"content": "<extra_id_89>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32011": {
"content": "<extra_id_88>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32012": {
"content": "<extra_id_87>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32013": {
"content": "<extra_id_86>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32014": {
"content": "<extra_id_85>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32015": {
"content": "<extra_id_84>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32016": {
"content": "<extra_id_83>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32017": {
"content": "<extra_id_82>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32018": {
"content": "<extra_id_81>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32019": {
"content": "<extra_id_80>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32020": {
"content": "<extra_id_79>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32021": {
"content": "<extra_id_78>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32022": {
"content": "<extra_id_77>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32023": {
"content": "<extra_id_76>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32024": {
"content": "<extra_id_75>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32025": {
"content": "<extra_id_74>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32026": {
"content": "<extra_id_73>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32027": {
"content": "<extra_id_72>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32028": {
"content": "<extra_id_71>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32029": {
"content": "<extra_id_70>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32030": {
"content": "<extra_id_69>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32031": {
"content": "<extra_id_68>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32032": {
"content": "<extra_id_67>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32033": {
"content": "<extra_id_66>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32034": {
"content": "<extra_id_65>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32035": {
"content": "<extra_id_64>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32036": {
"content": "<extra_id_63>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32037": {
"content": "<extra_id_62>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32038": {
"content": "<extra_id_61>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32039": {
"content": "<extra_id_60>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32040": {
"content": "<extra_id_59>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32041": {
"content": "<extra_id_58>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32042": {
"content": "<extra_id_57>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32043": {
"content": "<extra_id_56>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32044": {
"content": "<extra_id_55>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32045": {
"content": "<extra_id_54>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32046": {
"content": "<extra_id_53>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32047": {
"content": "<extra_id_52>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32048": {
"content": "<extra_id_51>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32049": {
"content": "<extra_id_50>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32050": {
"content": "<extra_id_49>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32051": {
"content": "<extra_id_48>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32052": {
"content": "<extra_id_47>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32053": {
"content": "<extra_id_46>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32054": {
"content": "<extra_id_45>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32055": {
"content": "<extra_id_44>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32056": {
"content": "<extra_id_43>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32057": {
"content": "<extra_id_42>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32058": {
"content": "<extra_id_41>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32059": {
"content": "<extra_id_40>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32060": {
"content": "<extra_id_39>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32061": {
"content": "<extra_id_38>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32062": {
"content": "<extra_id_37>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32063": {
"content": "<extra_id_36>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32064": {
"content": "<extra_id_35>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32065": {
"content": "<extra_id_34>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32066": {
"content": "<extra_id_33>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32067": {
"content": "<extra_id_32>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32068": {
"content": "<extra_id_31>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32069": {
"content": "<extra_id_30>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32070": {
"content": "<extra_id_29>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32071": {
"content": "<extra_id_28>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32072": {
"content": "<extra_id_27>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32073": {
"content": "<extra_id_26>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32074": {
"content": "<extra_id_25>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32075": {
"content": "<extra_id_24>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32076": {
"content": "<extra_id_23>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32077": {
"content": "<extra_id_22>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32078": {
"content": "<extra_id_21>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32079": {
"content": "<extra_id_20>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32080": {
"content": "<extra_id_19>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32081": {
"content": "<extra_id_18>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32082": {
"content": "<extra_id_17>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32083": {
"content": "<extra_id_16>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32084": {
"content": "<extra_id_15>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32085": {
"content": "<extra_id_14>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32086": {
"content": "<extra_id_13>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32087": {
"content": "<extra_id_12>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32088": {
"content": "<extra_id_11>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32089": {
"content": "<extra_id_10>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32090": {
"content": "<extra_id_9>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32091": {
"content": "<extra_id_8>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32092": {
"content": "<extra_id_7>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32093": {
"content": "<extra_id_6>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32094": {
"content": "<extra_id_5>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32095": {
"content": "<extra_id_4>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32096": {
"content": "<extra_id_3>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32097": {
"content": "<extra_id_2>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32098": {
"content": "<extra_id_1>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"32099": {
"content": "<extra_id_0>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<extra_id_0>",
"<extra_id_1>",
"<extra_id_2>",
"<extra_id_3>",
"<extra_id_4>",
"<extra_id_5>",
"<extra_id_6>",
"<extra_id_7>",
"<extra_id_8>",
"<extra_id_9>",
"<extra_id_10>",
"<extra_id_11>",
"<extra_id_12>",
"<extra_id_13>",
"<extra_id_14>",
"<extra_id_15>",
"<extra_id_16>",
"<extra_id_17>",
"<extra_id_18>",
"<extra_id_19>",
"<extra_id_20>",
"<extra_id_21>",
"<extra_id_22>",
"<extra_id_23>",
"<extra_id_24>",
"<extra_id_25>",
"<extra_id_26>",
"<extra_id_27>",
"<extra_id_28>",
"<extra_id_29>",
"<extra_id_30>",
"<extra_id_31>",
"<extra_id_32>",
"<extra_id_33>",
"<extra_id_34>",
"<extra_id_35>",
"<extra_id_36>",
"<extra_id_37>",
"<extra_id_38>",
"<extra_id_39>",
"<extra_id_40>",
"<extra_id_41>",
"<extra_id_42>",
"<extra_id_43>",
"<extra_id_44>",
"<extra_id_45>",
"<extra_id_46>",
"<extra_id_47>",
"<extra_id_48>",
"<extra_id_49>",
"<extra_id_50>",
"<extra_id_51>",
"<extra_id_52>",
"<extra_id_53>",
"<extra_id_54>",
"<extra_id_55>",
"<extra_id_56>",
"<extra_id_57>",
"<extra_id_58>",
"<extra_id_59>",
"<extra_id_60>",
"<extra_id_61>",
"<extra_id_62>",
"<extra_id_63>",
"<extra_id_64>",
"<extra_id_65>",
"<extra_id_66>",
"<extra_id_67>",
"<extra_id_68>",
"<extra_id_69>",
"<extra_id_70>",
"<extra_id_71>",
"<extra_id_72>",
"<extra_id_73>",
"<extra_id_74>",
"<extra_id_75>",
"<extra_id_76>",
"<extra_id_77>",
"<extra_id_78>",
"<extra_id_79>",
"<extra_id_80>",
"<extra_id_81>",
"<extra_id_82>",
"<extra_id_83>",
"<extra_id_84>",
"<extra_id_85>",
"<extra_id_86>",
"<extra_id_87>",
"<extra_id_88>",
"<extra_id_89>",
"<extra_id_90>",
"<extra_id_91>",
"<extra_id_92>",
"<extra_id_93>",
"<extra_id_94>",
"<extra_id_95>",
"<extra_id_96>",
"<extra_id_97>",
"<extra_id_98>",
"<extra_id_99>"
],
"clean_up_tokenization_spaces": true,
"eos_token": "</s>",
"extra_ids": 100,
"legacy": false,
"model_max_length": 512,
"pad_token": "<pad>",
"sp_model_kwargs": {},
"tokenizer_class": "T5Tokenizer",
"unk_token": "<unk>"
}