Commit 45b6cb6

merge 9588
1 parent 99f6082 commit 45b6cb6

16 files changed: +1590 -5 lines changed

scripts/convert_cogview3_to_diffusers.py

Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
"""
Convert a CogView3 checkpoint to the Diffusers format.

This script converts a CogView3 checkpoint to the Diffusers format so that it can be loaded
with the Diffusers library.

Example usage:
    python scripts/convert_cogview3_to_diffusers.py \
        --original_state_dict_repo_id "THUDM/cogview3" \
        --filename "cogview3.pt" \
        --transformer \
        --output_path "./cogview3_diffusers" \
        --dtype "bf16"

Alternatively, if you have a local checkpoint:
    python scripts/convert_cogview3_to_diffusers.py \
        --checkpoint_path '/raid/.cache/huggingface/models--ZP2HF--CogView3-SAT/snapshots/ca86ce9ba94f9a7f2dd109e7a59e4c8ad04121be/cogview3plus_3b/1/mp_rank_00_model_states.pt' \
        --transformer \
        --output_path "/raid/yiyi/cogview3_diffusers" \
        --dtype "bf16"

Arguments:
    --original_state_dict_repo_id: The Hugging Face repo ID containing the original checkpoint.
    --filename: The filename of the checkpoint in the repo (default: "flux.safetensors").
    --checkpoint_path: Path to a local checkpoint file (alternative to repo_id and filename).
    --transformer: Flag to convert the transformer model.
    --output_path: The path to save the converted model.
    --dtype: The dtype to save the model in (default: "bf16", options: "fp16", "bf16", "fp32").

Note: You must provide either --original_state_dict_repo_id or --checkpoint_path.
"""

import argparse
from contextlib import nullcontext

import torch
from accelerate import init_empty_weights
from huggingface_hub import hf_hub_download

from diffusers import CogView3PlusTransformer2DModel
from diffusers.utils.import_utils import is_accelerate_available


CTX = init_empty_weights if is_accelerate_available() else nullcontext

parser = argparse.ArgumentParser()
parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
parser.add_argument("--filename", default="flux.safetensors", type=str)
parser.add_argument("--checkpoint_path", default=None, type=str)
parser.add_argument("--transformer", action="store_true")
parser.add_argument("--output_path", type=str)
parser.add_argument("--dtype", type=str, default="bf16")

args = parser.parse_args()


def load_original_checkpoint(args):
    if args.original_state_dict_repo_id is not None:
        ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=args.filename)
    elif args.checkpoint_path is not None:
        ckpt_path = args.checkpoint_path
    else:
        raise ValueError("Please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")

    original_state_dict = torch.load(ckpt_path, map_location="cpu")
    return original_state_dict


# This is specific to `AdaLayerNormContinuous`:
# the diffusers implementation splits the linear projection into (scale, shift), while CogView3 splits it into (shift, scale).
def swap_scale_shift(weight, dim):
    shift, scale = weight.chunk(2, dim=0)
    new_weight = torch.cat([scale, shift], dim=0)
    return new_weight


def convert_cogview3_transformer_checkpoint_to_diffusers(original_state_dict):
    new_state_dict = {}

    # Convert pos_embed
    new_state_dict["pos_embed.proj.weight"] = original_state_dict.pop("mixins.patch_embed.proj.weight")
    new_state_dict["pos_embed.proj.bias"] = original_state_dict.pop("mixins.patch_embed.proj.bias")
    new_state_dict["pos_embed.text_proj.weight"] = original_state_dict.pop("mixins.patch_embed.text_proj.weight")
    new_state_dict["pos_embed.text_proj.bias"] = original_state_dict.pop("mixins.patch_embed.text_proj.bias")

    # Convert time_text_embed
    new_state_dict["time_text_embed.timestep_embedder.linear_1.weight"] = original_state_dict.pop(
        "time_embed.0.weight"
    )
    new_state_dict["time_text_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop("time_embed.0.bias")
    new_state_dict["time_text_embed.timestep_embedder.linear_2.weight"] = original_state_dict.pop(
        "time_embed.2.weight"
    )
    new_state_dict["time_text_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_embed.2.bias")
    new_state_dict["time_text_embed.text_embedder.linear_1.weight"] = original_state_dict.pop("label_emb.0.0.weight")
    new_state_dict["time_text_embed.text_embedder.linear_1.bias"] = original_state_dict.pop("label_emb.0.0.bias")
    new_state_dict["time_text_embed.text_embedder.linear_2.weight"] = original_state_dict.pop("label_emb.0.2.weight")
    new_state_dict["time_text_embed.text_embedder.linear_2.bias"] = original_state_dict.pop("label_emb.0.2.bias")

    # Convert transformer blocks
    for i in range(30):
        block_prefix = f"transformer_blocks.{i}."
        old_prefix = f"transformer.layers.{i}."
        adaln_prefix = f"mixins.adaln.adaln_modules.{i}."

        new_state_dict[block_prefix + "norm1.linear.weight"] = original_state_dict.pop(adaln_prefix + "1.weight")
        new_state_dict[block_prefix + "norm1.linear.bias"] = original_state_dict.pop(adaln_prefix + "1.bias")

        qkv_weight = original_state_dict.pop(old_prefix + "attention.query_key_value.weight")
        qkv_bias = original_state_dict.pop(old_prefix + "attention.query_key_value.bias")
        q, k, v = qkv_weight.chunk(3, dim=0)
        q_bias, k_bias, v_bias = qkv_bias.chunk(3, dim=0)

        new_state_dict[block_prefix + "attn.to_q.weight"] = q
        new_state_dict[block_prefix + "attn.to_q.bias"] = q_bias
        new_state_dict[block_prefix + "attn.to_k.weight"] = k
        new_state_dict[block_prefix + "attn.to_k.bias"] = k_bias
        new_state_dict[block_prefix + "attn.to_v.weight"] = v
        new_state_dict[block_prefix + "attn.to_v.bias"] = v_bias

        new_state_dict[block_prefix + "attn.to_out.0.weight"] = original_state_dict.pop(
            old_prefix + "attention.dense.weight"
        )
        new_state_dict[block_prefix + "attn.to_out.0.bias"] = original_state_dict.pop(
            old_prefix + "attention.dense.bias"
        )

        new_state_dict[block_prefix + "ff.net.0.proj.weight"] = original_state_dict.pop(
            old_prefix + "mlp.dense_h_to_4h.weight"
        )
        new_state_dict[block_prefix + "ff.net.0.proj.bias"] = original_state_dict.pop(
            old_prefix + "mlp.dense_h_to_4h.bias"
        )
        new_state_dict[block_prefix + "ff.net.2.weight"] = original_state_dict.pop(
            old_prefix + "mlp.dense_4h_to_h.weight"
        )
        new_state_dict[block_prefix + "ff.net.2.bias"] = original_state_dict.pop(old_prefix + "mlp.dense_4h_to_h.bias")

    # Convert final norm and projection
    new_state_dict["norm_out.linear.weight"] = swap_scale_shift(
        original_state_dict.pop("mixins.final_layer.adaln.1.weight"), dim=0
    )
    new_state_dict["norm_out.linear.bias"] = swap_scale_shift(
        original_state_dict.pop("mixins.final_layer.adaln.1.bias"), dim=0
    )
    new_state_dict["proj_out.weight"] = original_state_dict.pop("mixins.final_layer.linear.weight")
    new_state_dict["proj_out.bias"] = original_state_dict.pop("mixins.final_layer.linear.bias")

    return new_state_dict


def main(args):
    original_ckpt = load_original_checkpoint(args)
    original_ckpt = original_ckpt["module"]
    original_ckpt = {k.replace("model.diffusion_model.", ""): v for k, v in original_ckpt.items()}

    original_dtype = next(iter(original_ckpt.values())).dtype
    dtype = None
    if args.dtype is None:
        dtype = original_dtype
    elif args.dtype == "fp16":
        dtype = torch.float16
    elif args.dtype == "bf16":
        dtype = torch.bfloat16
    elif args.dtype == "fp32":
        dtype = torch.float32
    else:
        raise ValueError(f"Unsupported dtype: {args.dtype}")

    if args.transformer:
        converted_transformer_state_dict = convert_cogview3_transformer_checkpoint_to_diffusers(original_ckpt)
        transformer = CogView3PlusTransformer2DModel()
        transformer.load_state_dict(converted_transformer_state_dict, strict=True)

        print(f"Saving CogView3 Transformer in Diffusers format in {args.output_path}/transformer")
        transformer.to(dtype).save_pretrained(f"{args.output_path}/transformer")

    if len(original_ckpt) > 0:
        print(f"Warning: {len(original_ckpt)} keys were not converted and will be saved as is: {original_ckpt.keys()}")


if __name__ == "__main__":
    main(args)
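For reference, a transformer converted and saved by this script can be loaded back through the standard Diffusers `from_pretrained` API. A minimal sketch, assuming the example `--output_path` from the docstring above (adjust the path to your own output directory):

import torch

from diffusers import CogView3PlusTransformer2DModel

# "./cogview3_diffusers" is the example --output_path used in the docstring; adjust as needed.
transformer = CogView3PlusTransformer2DModel.from_pretrained(
    "./cogview3_diffusers/transformer", torch_dtype=torch.bfloat16
)
print(transformer.config)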

show_model.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
import torch
from diffusers.loaders.single_file_utils import convert_ldm_vae_checkpoint
from diffusers import AutoencoderKL
from huggingface_hub import hf_hub_download
from sgm.models.autoencoder import AutoencodingEngine

# (1) create vae_sat
# AutoencodingEngine initialization arguments:
encoder_config = {'target': 'sgm.modules.diffusionmodules.model.Encoder', 'params': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 16, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 4, 8, 8], 'num_res_blocks': 3, 'attn_resolutions': [], 'mid_attn': False, 'dropout': 0.0}}
decoder_config = {'target': 'sgm.modules.diffusionmodules.model.Decoder', 'params': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 16, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 4, 8, 8], 'num_res_blocks': 3, 'attn_resolutions': [], 'mid_attn': False, 'dropout': 0.0}}
loss_config = {'target': 'torch.nn.Identity'}
regularizer_config = {'target': 'sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer'}
optimizer_config = None
lr_g_factor = 1.0
ckpt_path = "/raid/.cache/huggingface/models--ZP2HF--CogView3-SAT/snapshots/ca86ce9ba94f9a7f2dd109e7a59e4c8ad04121be/3plus_ae/imagekl_ch16.pt"
ignore_keys = []
kwargs = {"monitor": "val/rec_loss"}
vae_sat = AutoencodingEngine(
    encoder_config=encoder_config,
    decoder_config=decoder_config,
    loss_config=loss_config,
    regularizer_config=regularizer_config,
    optimizer_config=optimizer_config,
    lr_g_factor=lr_g_factor,
    ckpt_path=ckpt_path,
    ignore_keys=ignore_keys,
    **kwargs,
)


# (2) create vae (diffusers)
ckpt_path_vae_cogview3 = hf_hub_download(repo_id="ZP2HF/CogView3-SAT", subfolder="3plus_ae", filename="imagekl_ch16.pt")
cogview3_ckpt = torch.load(ckpt_path_vae_cogview3, map_location='cpu')["state_dict"]

in_channels = 3  # Inferred from encoder.conv_in.weight shape
out_channels = 3  # Inferred from decoder.conv_out.weight shape
down_block_types = ("DownEncoderBlock2D",) * 4  # Inferred from the presence of 4 encoder.down blocks
up_block_types = ("UpDecoderBlock2D",) * 4  # Inferred from the presence of 4 decoder.up blocks
block_out_channels = (128, 512, 1024, 1024)  # Inferred from the channel sizes in encoder.down blocks
layers_per_block = 3  # Inferred from the number of blocks in each encoder.down and decoder.up
act_fn = "silu"  # This is the default, cannot be inferred from state_dict
latent_channels = 16  # Inferred from decoder.conv_in.weight shape
norm_num_groups = 32  # This is the default, cannot be inferred from state_dict
sample_size = 1024  # This is the default, cannot be inferred from state_dict
scaling_factor = 0.18215  # This is the default, cannot be inferred from state_dict
force_upcast = True  # This is the default, cannot be inferred from state_dict
use_quant_conv = False  # Inferred from the presence of encoder.conv_out
use_post_quant_conv = False  # Inferred from the presence of decoder.conv_in
mid_block_add_attention = False  # Inferred from the absence of attention layers in mid blocks

vae = AutoencoderKL(
    in_channels=in_channels,
    out_channels=out_channels,
    down_block_types=down_block_types,
    up_block_types=up_block_types,
    block_out_channels=block_out_channels,
    layers_per_block=layers_per_block,
    act_fn=act_fn,
    latent_channels=latent_channels,
    norm_num_groups=norm_num_groups,
    sample_size=sample_size,
    scaling_factor=scaling_factor,
    force_upcast=force_upcast,
    use_quant_conv=use_quant_conv,
    use_post_quant_conv=use_post_quant_conv,
    mid_block_add_attention=mid_block_add_attention,
)

vae.eval()
vae_sat.eval()

converted_vae_state_dict = convert_ldm_vae_checkpoint(cogview3_ckpt, vae.config)
vae.load_state_dict(converted_vae_state_dict, strict=False)

# (3) run forward pass for both models

# [2, 16, 128, 128] -> [2, 3, 1024, 1024]
z = torch.load("z.pt").float().to("cpu")

with torch.no_grad():
    print(" ")
    print(" running forward pass for diffusers vae")
    out = vae.decode(z).sample
    print(" ")
    print(" running forward pass for sgm vae")
    out_sat = vae_sat.decode(z)

print(f" output shape: {out.shape}")
print(f" expected output shape: {out_sat.shape}")
assert out.shape == out_sat.shape
assert (out - out_sat).abs().max() < 1e-4, f"max diff: {(out - out_sat).abs().max()}"
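Once the numerical check passes, the converted `vae` built above could also be written out in the Diffusers layout. A minimal follow-up sketch, continuing from the script above; the target directory is illustrative and simply mirrors the transformer's output root:

# Hypothetical save location, chosen to sit next to the converted transformer weights.
vae.save_pretrained("./cogview3_diffusers/vae")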

show_model_cogview.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
import torch
from diffusers import CogView3PlusTransformer2DModel

model = CogView3PlusTransformer2DModel.from_pretrained("/share/home/zyx/Models/CogView3Plus_hf/transformer", torch_dtype=torch.bfloat16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

batch_size = 1
hidden_states = torch.ones((batch_size, 16, 256, 256), device=device, dtype=torch.bfloat16)
timestep = torch.full((batch_size,), 999.0, device=device, dtype=torch.bfloat16)
y = torch.ones((batch_size, 1536), device=device, dtype=torch.bfloat16)

# Simulate a call to the forward method
outputs = model(
    hidden_states=hidden_states,  # hidden_states input
    timestep=timestep,  # timestep input
    y=y,  # label input
    block_controlnet_hidden_states=None,  # can be omitted if not needed
    return_dict=True,  # keep the default
    target_size=[(2048, 2048)],
)

# Print the model output
print("Output shape:", outputs.sample.shape)

src/diffusers/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -84,6 +84,7 @@
 "AutoencoderOobleck",
 "AutoencoderTiny",
 "CogVideoXTransformer3DModel",
+"CogView3PlusTransformer2DModel",
 "ConsistencyDecoderVAE",
 "ControlNetModel",
 "ControlNetXSAdapter",
@@ -258,6 +259,7 @@
 "CogVideoXImageToVideoPipeline",
 "CogVideoXPipeline",
 "CogVideoXVideoToVideoPipeline",
+"CogView3PlusPipeline",
 "CycleDiffusionPipeline",
 "FluxControlNetImg2ImgPipeline",
 "FluxControlNetInpaintPipeline",
@@ -558,6 +560,7 @@
 AutoencoderOobleck,
 AutoencoderTiny,
 CogVideoXTransformer3DModel,
+CogView3PlusTransformer2DModel,
 ConsistencyDecoderVAE,
 ControlNetModel,
 ControlNetXSAdapter,
@@ -710,6 +713,7 @@
 CogVideoXImageToVideoPipeline,
 CogVideoXPipeline,
 CogVideoXVideoToVideoPipeline,
+CogView3PlusPipeline,
 CycleDiffusionPipeline,
 FluxControlNetImg2ImgPipeline,
 FluxControlNetInpaintPipeline,

src/diffusers/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@
 _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
 _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
 _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
+_import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
 _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
 _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
 _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
@@ -98,6 +99,7 @@
 from .transformers import (
 AuraFlowTransformer2DModel,
 CogVideoXTransformer3DModel,
+CogView3PlusTransformer2DModel,
 DiTTransformer2DModel,
 DualTransformer2DModel,
 FluxTransformer2DModel,

src/diffusers/models/attention_processor.py

Lines changed: 3 additions & 2 deletions
@@ -122,6 +122,7 @@ def __init__(
         out_dim: int = None,
         context_pre_only=None,
         pre_only=False,
+        layrnorm_elementwise_affine: bool = True,
     ):
         super().__init__()

@@ -179,8 +180,8 @@ def __init__(
             self.norm_q = None
             self.norm_k = None
         elif qk_norm == "layer_norm":
-            self.norm_q = nn.LayerNorm(dim_head, eps=eps)
-            self.norm_k = nn.LayerNorm(dim_head, eps=eps)
+            self.norm_q = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=layrnorm_elementwise_affine)
+            self.norm_k = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=layrnorm_elementwise_affine)
         elif qk_norm == "fp32_layer_norm":
             self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
             self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
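For context, the new `layrnorm_elementwise_affine` argument is simply forwarded to `nn.LayerNorm`, presumably so the CogView3 attention blocks can opt out of the learnable affine parameters. A small standalone PyTorch illustration of what toggling `elementwise_affine` changes (not part of the commit):

import torch.nn as nn

# Default behaviour: LayerNorm owns learnable per-element weight and bias tensors.
ln_affine = nn.LayerNorm(64, eps=1e-5, elementwise_affine=True)
print([name for name, _ in ln_affine.named_parameters()])  # ['weight', 'bias']

# With elementwise_affine=False the module is pure normalization with no parameters.
ln_plain = nn.LayerNorm(64, eps=1e-5, elementwise_affine=False)
print([name for name, _ in ln_plain.named_parameters()])  # []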
