diff --git a/opensora/models/stdit/stdit2.py b/opensora/models/stdit/stdit2.py index afcc0c12..5de17694 100644 --- a/opensora/models/stdit/stdit2.py +++ b/opensora/models/stdit/stdit2.py @@ -1,6 +1,7 @@ import numpy as np import torch import torch.nn as nn +import os from einops import rearrange from rotary_embedding_torch import RotaryEmbedding from timm.models.layers import DropPath @@ -23,6 +24,7 @@ ) from opensora.registry import MODELS from transformers import PretrainedConfig, PreTrainedModel +from opensora.utils.ckpt_utils import load_checkpoint class STDiT2Block(nn.Module): @@ -502,8 +504,22 @@ def _basic_init(module): @MODELS.register_module("STDiT2-XL/2") def STDiT2_XL_2(from_pretrained=None, **kwargs): if from_pretrained is not None: - model = STDiT2.from_pretrained(from_pretrained, **kwargs) + if os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained): + # if it is a directory or a file, we load the checkpoint manually + config = STDiT2Config( + depth=28, + hidden_size=1152, + patch_size=(1, 2, 2), + num_heads=16, **kwargs + ) + model = STDiT2(config) + load_checkpoint(model, from_pretrained) + return model + else: + # otherwise, we load the model from hugging face hub + return STDiT2.from_pretrained(from_pretrained) else: + # create a new model config = STDiT2Config( depth=28, hidden_size=1152, diff --git a/scripts/train.py b/scripts/train.py index aed7b452..bfb6d39c 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -133,8 +133,7 @@ def main(): input_size=latent_size, in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, - model_max_length=text_encoder.model_max_length, - dtype=dtype, + model_max_length=text_encoder.model_max_length ) model_numel, model_numel_trainable = get_model_numel(model) logger.info(