Merge pull request #2 from a-r-r-o-w/latte-2

maxin-cn · web-flow · commit 7988119bbdb2 · 2024-07-11T16:23:01.000+10:00
update _toctree.yml for docs and fix example
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -249,6 +249,8 @@
       title: DiTTransformer2DModel
     - local: api/models/hunyuan_transformer2d
       title: HunyuanDiT2DModel
+    - local: api/models/latte_transformer3d
+      title: LatteTransformer3DModel
     - local: api/models/lumina_nextdit2d
       title: LuminaNextDiT2DModel
     - local: api/models/transformer_temporal
diff --git a/docs/source/en/api/models/latte_transformer3d.md b/docs/source/en/api/models/latte_transformer3d.md
@@ -17,4 +17,3 @@ A Diffusion Transformer model for 3D data from [Latte](https://github.com/Vchite
 ## LatteTransformer3DModel
 
 [[autodoc]] LatteTransformer3DModel
-
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
@@ -35,10 +35,21 @@ def get_timestep_embedding(
     """
     This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
 
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    embeddings. :return: an [N x dim] Tensor of positional embeddings.
+    Args:
+        timesteps (torch.Tensor):
+            a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        embedding_dim (int):
+            the dimension of the output.
+        flip_sin_to_cos (bool):
+            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
+        downscale_freq_shift (float):
+            Controls the delta between the frequencies of adjacent dimensions.
+        scale (float):
+            Scaling factor applied to the embeddings.
+        max_period (int):
+            Controls the minimum frequency of the embeddings.
+    Returns:
+        torch.Tensor: an [N x dim] Tensor of positional embeddings.
     """
     assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
 
diff --git a/src/diffusers/pipelines/latte/pipeline_latte.py b/src/diffusers/pipelines/latte/pipeline_latte.py
@@ -56,12 +56,12 @@
         >>> from diffusers.utils import export_to_gif
 
         >>> # You can replace the checkpoint id with "maxin-cn/Latte-1" too.
-        >>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16)
+        >>> pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16).to("cuda")
         >>> # Enable memory optimizations.
         >>> pipe.enable_model_cpu_offload()
 
         >>> prompt = "A small cactus with a happy face in the Sahara desert."
-        >>> videos = pipe(prompt).frames
+        >>> videos = pipe(prompt).frames[0]
         >>> export_to_gif(videos, "latte.gif")
         ```
 """
@@ -576,7 +576,7 @@ def prepare_latents(
         # scale the initial noise by the standard deviation required by the scheduler
         latents = latents * self.scheduler.init_noise_sigma
         return latents
-    
+
     @property
     def guidance_scale(self):
         return self._guidance_scale
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -677,7 +677,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class LDMTextToImagePipeline(metaclass=DummyObject):
+class LattePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -692,7 +692,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class LEditsPPPipelineStableDiffusion(metaclass=DummyObject):
+class LDMTextToImagePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -707,7 +707,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject):
+class LEditsPPPipelineStableDiffusion(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -722,7 +722,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class LattePipeline(metaclass=DummyObject):
+class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
diff --git a/tests/pipelines/latte/test_latte.py b/tests/pipelines/latte/test_latte.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import gc
+import inspect
 import tempfile
 import unittest
 
@@ -38,7 +39,6 @@
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin, to_np
 
-import inspect
 
 enable_full_determinism()
 

Original file line number	Diff line number	Diff line change
`@@ -17,4 +17,3 @@ A Diffusion Transformer model for 3D data from [Latte](https://github.com/Vchite`
`17`	`17`	`## LatteTransformer3DModel`
`18`	`18`
`19`	`19`	`[[autodoc]] LatteTransformer3DModel`
`20`		`-`