import functools
import math
from typing import Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.vision_transformer import Mlp


class CogVideoXPatchEmbed(nn.Module):
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        embed_dim: int = 1920,
        text_embed_dim: int = 4096,
        bias: bool = True,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size

        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
        )
        self.text_proj = nn.Linear(text_embed_dim, embed_dim)

    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
        r"""
        Args:
            text_embeds (`torch.Tensor`):
                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
        """
        text_embeds = self.text_proj(text_embeds)

        batch, num_frames, channels, height, width = image_embeds.shape
        image_embeds = image_embeds.reshape(-1, channels, height, width)
        image_embeds = self.proj(image_embeds)
        image_embeds = image_embeds.view(batch, num_frames, *image_embeds.shape[1:])
        image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
        image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]

        embeds = torch.cat(
            [text_embeds, image_embeds], dim=1
        ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]
        return embeds
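

# Illustrative usage sketch (shapes are assumptions for demonstration, not fixed by the class):
#
#     patch_embed = CogVideoXPatchEmbed(patch_size=2, in_channels=16, embed_dim=1920, text_embed_dim=4096)
#     text_embeds = torch.randn(1, 226, 4096)        # (B, seq_len, text_embed_dim)
#     image_embeds = torch.randn(1, 13, 16, 60, 90)  # (B, T, C, H, W) latent frames
#     out = patch_embed(text_embeds, image_embeds)
#     # out.shape == (1, 226 + 13 * (60 // 2) * (90 // 2), 1920)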


class OpenSoraPatchEmbed3D(nn.Module):
    """Video to Patch Embedding.

    Args:
        patch_size (tuple[int]): Patch token size. Default: (2, 4, 4).
        in_chans (int): Number of input video channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None.
        flatten (bool): Flatten the output tokens to (B, N, C). Default: True.
    """

    def __init__(
        self,
        patch_size=(2, 4, 4),
        in_chans=3,
        embed_dim=96,
        norm_layer=None,
        flatten=True,
    ):
        super().__init__()
        self.patch_size = patch_size
        self.flatten = flatten

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Forward function."""
        # Pad each dimension up to a multiple of the patch size.
        # F.pad pads from the last dimension backwards: (W, then H, then D).
        _, _, D, H, W = x.size()
        if W % self.patch_size[2] != 0:
            x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
        if H % self.patch_size[1] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
        if D % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))

        x = self.proj(x)  # (B, C, D, Wh, Ww)
        if self.norm is not None:
            D, Wh, Ww = x.size(2), x.size(3), x.size(4)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCDHW -> BNC
        return x
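

# Illustrative usage sketch (the input shape is an assumption for demonstration):
#
#     embed = OpenSoraPatchEmbed3D(patch_size=(2, 4, 4), in_chans=3, embed_dim=96)
#     video = torch.randn(1, 3, 17, 257, 257)  # (B, C, D, H, W), deliberately not multiples of the patch size
#     tokens = embed(video)
#     # D, H, W are padded to 18, 260, 260, so tokens.shape == (1, 9 * 65 * 65, 96)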


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half)
        freqs = freqs.to(device=t.device)
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t, dtype):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        if t_freq.dtype != dtype:
            t_freq = t_freq.to(dtype)
        t_emb = self.mlp(t_freq)
        return t_emb
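

# Worked sketch of the sinusoidal embedding (values are illustrative):
#
#     embedder = TimestepEmbedder(hidden_size=1152)
#     t = torch.tensor([0.0, 500.0, 999.0])
#     emb = embedder(t, dtype=torch.float32)  # (3, 1152)
#     # Internally, timestep_embedding(t, 256) builds frequencies
#     # max_period ** (-k / 128) for k = 0..127 and concatenates cos/sin of t * freq.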


class SizeEmbedder(TimestepEmbedder):
    """
    Embeds scalar size values (e.g. height and width) into vector representations, reusing the
    sinusoidal embedding from TimestepEmbedder.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size)
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size
        self.outdim = hidden_size

    def forward(self, s, bs):
        if s.ndim == 1:
            s = s[:, None]
        assert s.ndim == 2
        if s.shape[0] != bs:
            # Broadcast shared sizes across the batch (bs must be a multiple of s.shape[0]).
            s = s.repeat(bs // s.shape[0], 1)
            assert s.shape[0] == bs
        b, dims = s.shape[0], s.shape[1]
        s = rearrange(s, "b d -> (b d)")
        s_freq = self.timestep_embedding(s, self.frequency_embedding_size).to(self.dtype)
        s_emb = self.mlp(s_freq)
        s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
        return s_emb

    @property
    def dtype(self):
        return next(self.parameters()).dtype
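

# Illustrative usage sketch (values are assumptions for demonstration):
#
#     size_embedder = SizeEmbedder(hidden_size=1152)
#     hw = torch.tensor([[480.0, 854.0]])  # one shared (height, width) pair
#     emb = size_embedder(hw, bs=4)        # broadcast to batch size 4
#     # emb.shape == (4, 2 * 1152): one embedding per scalar, concatenated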


class OpenSoraCaptionEmbedder(nn.Module):
    """
    Projects caption embeddings into vector representations. Also handles caption dropout for
    classifier-free guidance.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        uncond_prob,
        act_layer=lambda: nn.GELU(approximate="tanh"),  # timm's Mlp expects a layer factory, not an instance
        token_num=120,
    ):
        super().__init__()
        self.y_proj = Mlp(
            in_features=in_channels,
            hidden_features=hidden_size,
            out_features=hidden_size,
            act_layer=act_layer,
            drop=0,
        )
        # Fixed (non-trainable) null-caption embedding substituted when a caption is dropped.
        self.register_buffer(
            "y_embedding",
            torch.randn(token_num, in_channels) / in_channels**0.5,
        )
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops captions to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        if train:
            assert caption.shape[2:] == self.y_embedding.shape
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        caption = self.y_proj(caption)
        return caption
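

# Illustrative usage sketch (shapes follow the (token_num, in_channels) layout above; numbers are assumptions):
#
#     cap_embedder = OpenSoraCaptionEmbedder(in_channels=4096, hidden_size=1152, uncond_prob=0.1)
#     caption = torch.randn(2, 1, 120, 4096)  # (B, 1, token_num, in_channels)
#     y = cap_embedder(caption, train=True)   # ~10% of captions replaced by y_embedding
#     # y.shape == (2, 1, 120, 1152)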


class OpenSoraPositionEmbedding2D(nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.dim = dim
        assert dim % 4 == 0, "dim must be divisible by 4"
        half_dim = dim // 2
        inv_freq = 1.0 / (10000 ** (torch.arange(0, half_dim, 2).float() / half_dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def _get_sin_cos_emb(self, t: torch.Tensor):
        out = torch.einsum("i,d->id", t, self.inv_freq)
        emb_cos = torch.cos(out)
        emb_sin = torch.sin(out)
        return torch.cat((emb_sin, emb_cos), dim=-1)

    @functools.lru_cache(maxsize=512)
    def _get_cached_emb(
        self,
        device: torch.device,
        dtype: torch.dtype,
        h: int,
        w: int,
        scale: float = 1.0,
        base_size: Optional[int] = None,
    ):
        grid_h = torch.arange(h, device=device) / scale
        grid_w = torch.arange(w, device=device) / scale
        if base_size is not None:
            grid_h *= base_size / h
            grid_w *= base_size / w
        # w goes first and the result is transposed, mirroring np.meshgrid's default "xy"
        # ordering used in the original DiT 2D sincos position embedding.
        grid_h, grid_w = torch.meshgrid(
            grid_w,
            grid_h,
            indexing="ij",
        )
        grid_h = grid_h.t().reshape(-1)
        grid_w = grid_w.t().reshape(-1)
        emb_h = self._get_sin_cos_emb(grid_h)
        emb_w = self._get_sin_cos_emb(grid_w)
        return torch.concat([emb_h, emb_w], dim=-1).unsqueeze(0).to(dtype)

    def forward(
        self,
        x: torch.Tensor,
        h: int,
        w: int,
        scale: float = 1.0,
        base_size: Optional[int] = None,
    ) -> torch.Tensor:
        return self._get_cached_emb(x.device, x.dtype, h, w, scale, base_size)
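

# Illustrative usage sketch (shapes are assumptions for demonstration):
#
#     pos_embed = OpenSoraPositionEmbedding2D(dim=1152)
#     x = torch.randn(1, 16 * 16, 1152)
#     pe = pos_embed(x, h=16, w=16)  # (1, 256, 1152), cached per (device, dtype, h, w, scale, base_size)
#     x = x + pe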


def get_3d_rotary_pos_embed(
    embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    RoPE for video tokens with 3D structure.

    Args:
        embed_dim (`int`):
            The embedding dimension size, corresponding to hidden_size_head.
        crops_coords (`Tuple[int]`):
            The top-left and bottom-right coordinates of the crop.
        grid_size (`Tuple[int]`):
            The grid size of the spatial positional embedding (height, width).
        temporal_size (`int`):
            The size of the temporal dimension.
        theta (`int`):
            Scaling factor for frequency computation.
        use_real (`bool`):
            If True, return the cosine and sine parts separately. Otherwise, return complex numbers.

    Returns:
        `torch.Tensor` or `Tuple[torch.Tensor, torch.Tensor]`: positional embedding of shape
        `(temporal_size * grid_size[0] * grid_size[1], embed_dim)` — a (cos, sin) pair if `use_real`
        is True, else a single complex tensor.
    """
    start, stop = crops_coords
    grid_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
    grid_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
    grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)

    # Compute dimensions for each axis: 1/4 of the head dim for time, 3/8 each for height and width.
    dim_t = embed_dim // 4
    dim_h = embed_dim // 8 * 3
    dim_w = embed_dim // 8 * 3

    # Temporal frequencies
    freqs_t = 1.0 / (theta ** (torch.arange(0, dim_t, 2).float() / dim_t))
    grid_t = torch.from_numpy(grid_t).float()
    freqs_t = torch.einsum("n , f -> n f", grid_t, freqs_t)
    freqs_t = freqs_t.repeat_interleave(2, dim=-1)

    # Spatial frequencies for height and width
    freqs_h = 1.0 / (theta ** (torch.arange(0, dim_h, 2).float() / dim_h))
    freqs_w = 1.0 / (theta ** (torch.arange(0, dim_w, 2).float() / dim_w))
    grid_h = torch.from_numpy(grid_h).float()
    grid_w = torch.from_numpy(grid_w).float()
    freqs_h = torch.einsum("n , f -> n f", grid_h, freqs_h)
    freqs_w = torch.einsum("n , f -> n f", grid_w, freqs_w)
    freqs_h = freqs_h.repeat_interleave(2, dim=-1)
    freqs_w = freqs_w.repeat_interleave(2, dim=-1)

    def broadcast(tensors, dim=-1):
        """Expand the tensors to a common shape on every axis except `dim`, then concatenate along `dim`."""
        num_tensors = len(tensors)
        shape_lens = {len(t.shape) for t in tensors}
        assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
        shape_len = list(shape_lens)[0]
        dim = (dim + shape_len) if dim < 0 else dim
        dims = list(zip(*(list(t.shape) for t in tensors)))
        expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
        assert all(
            len(set(t[1])) <= 2 for t in expandable_dims
        ), "invalid dimensions for broadcastable concatenation"
        max_dims = [(t[0], max(t[1])) for t in expandable_dims]
        expanded_dims = [(t[0], (t[1],) * num_tensors) for t in max_dims]
        expanded_dims.insert(dim, (dim, dims[dim]))
        expandable_shapes = list(zip(*(t[1] for t in expanded_dims)))
        tensors = [t[0].expand(*t[1]) for t in zip(tensors, expandable_shapes)]
        return torch.cat(tensors, dim=dim)

    # Combine the per-axis frequencies on a (T, H, W) grid, then flatten to one token sequence.
    freqs = broadcast((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)

    t, h, w, d = freqs.shape
    freqs = freqs.view(t * h * w, d)

    sin = freqs.sin()
    cos = freqs.cos()

    if use_real:
        return cos, sin
    else:
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
        return freqs_cis
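

# Illustrative usage sketch (numbers are assumptions for demonstration):
#
#     cos, sin = get_3d_rotary_pos_embed(
#         embed_dim=64,                    # per-head dimension; 16 temporal + 24 + 24 spatial channels
#         crops_coords=((0, 0), (30, 45)),
#         grid_size=(30, 45),
#         temporal_size=13,
#     )
#     # cos.shape == sin.shape == (13 * 30 * 45, 64)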


def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> torch.Tensor:
    """
    Apply rotary embeddings to the input tensor using the given frequency tensor. The input is
    reshaped into (real, imaginary) pairs, rotated by the precomputed frequencies `freqs_cis`, and
    returned as a real tensor of the same shape and dtype.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings to. Expected shape: [B, H, S, D].
        freqs_cis (`Tuple[torch.Tensor]`):
            Precomputed frequency tensors for complex exponentials: a (cos, sin) pair of shape
            ([S, D], [S, D]) when `use_real` is True, else a single complex tensor.

    Returns:
        `torch.Tensor`: the input tensor with rotary embeddings applied.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        if use_real_unbind_dim == -1:
            # Interleaved layout: (r0, i0, r1, i1, ...) along the last dimension.
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
            # Split-half layout: (r0, r1, ..., i0, i1, ...) along the last dimension.
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
    else:
        # This branch expects x in [B, S, H, D] layout: freqs_cis of shape [S, D/2] is unsqueezed
        # to [S, 1, D/2] so it broadcasts over the heads dimension.
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)
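
# Illustrative usage sketch, combining the two helpers above (shapes are assumptions):
#
#     cos, sin = get_3d_rotary_pos_embed(64, ((0, 0), (30, 45)), (30, 45), 13)
#     q = torch.randn(2, 8, 13 * 30 * 45, 64)  # [B, H, S, D]
#     k = torch.randn(2, 8, 13 * 30 * 45, 64)
#     q = apply_rotary_emb(q, (cos, sin))
#     k = apply_rotary_emb(k, (cos, sin))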