# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import torch
import torch.nn as nn


def get_3d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int (or (h, w) tuple) of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
        [extra_tokens + grid_size*grid_size, embed_dim] (w/ or w/o extra tokens)
    """
    if isinstance(grid_size, tuple):
        grid_size_h, grid_size_w = grid_size
    else:
        grid_size_h = grid_size_w = grid_size
    grid_h = np.arange(grid_size_h, dtype=np.float32)
    grid_w = np.arange(grid_size_w, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    # get_3d_sincos_pos_embed_from_grid expects a (B, S, N, 3) torch tensor;
    # only a planar (h, w) grid is available here, so the z coordinate is zeroed.
    grid = np.stack(grid + [np.zeros_like(grid[0])], axis=-1)
    grid = torch.from_numpy(grid.reshape(1, 1, grid_size_h * grid_size_w, 3))
    pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
    pos_embed = pos_embed.reshape(-1, embed_dim).numpy()
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate(
            [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
        )
    return pos_embed


def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    grid: torch tensor of shape (B, S, N, 3) holding (x, y, z) positions
    return: torch tensor of shape (B, S, N, embed_dim)
    """
    assert embed_dim % 3 == 0
    # use a third of the dimensions to encode each of x, y, z
    B, S, N, _ = grid.shape
    gridx = grid[..., 0].view(B * S * N).detach().cpu().numpy()
    gridy = grid[..., 1].view(B * S * N).detach().cpu().numpy()
    gridz = grid[..., 2].view(B * S * N).detach().cpu().numpy()
    emb_x = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridx)  # (B*S*N, D/3)
    emb_y = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridy)  # (B*S*N, D/3)
    emb_z = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, gridz)  # (B*S*N, D/3)

    emb = np.concatenate([emb_x, emb_y, emb_z], axis=1)  # (B*S*N, D)
    emb = torch.from_numpy(emb).to(grid.device)
    return emb.view(B, S, N, embed_dim)


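# Illustrative sanity check for the 3-D sinusoidal embedding above; the shapes
# and hyperparameters are arbitrary, and this sketch is not part of the
# original API:
def _demo_3d_sincos_pos_embed():
    grid = torch.rand(2, 8, 64, 3)  # (B, S, N, 3) positions
    # embed_dim must be divisible by 3, and embed_dim // 3 must be even
    emb = get_3d_sincos_pos_embed_from_grid(384, grid)
    assert emb.shape == (2, 8, 64, 384)

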
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int (or (h, w) tuple) of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
        [extra_tokens + grid_size*grid_size, embed_dim] (w/ or w/o extra tokens)
    """
    if isinstance(grid_size, tuple):
        grid_size_h, grid_size_w = grid_size
    else:
        grid_size_h = grid_size_w = grid_size
    grid_h = np.arange(grid_size_h, dtype=np.float32)
    grid_w = np.arange(grid_size_w, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate(
            [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
        )
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of the dimensions to encode each spatial axis
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


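# Illustrative shape check for the 2-D sinusoidal path, placed after all of its
# helpers are defined; the values are arbitrary and this sketch is not part of
# the original API:
def _demo_2d_sincos_pos_embed():
    pos_embed = get_2d_sincos_pos_embed(128, 16)  # 16x16 grid, 128-dim embedding
    assert pos_embed.shape == (16 * 16, 128)
    pos_embed = get_2d_sincos_pos_embed(128, (8, 12), cls_token=True, extra_tokens=1)
    assert pos_embed.shape == (1 + 8 * 12, 128)  # zero row prepended for the extra token

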
def get_2d_embedding(xy, C, cat_coords=True):
    B, N, D = xy.shape
    assert D == 2

    x = xy[:, :, 0:1]
    y = xy[:, :, 1:2]
    div_term = (
        torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, C // 2)

    pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe = torch.cat([pe_x, pe_y], dim=2)  # (B, N, C*2)
    if cat_coords:
        # note: unlike the 3-D/4-D variants below, the raw coords go first here
        pe = torch.cat([xy, pe], dim=2)  # (B, N, C*2+2)
    return pe


def get_3d_embedding(xyz, C, cat_coords=True):
    B, N, D = xyz.shape
    assert D == 3

    x = xyz[:, :, 0:1]
    y = xyz[:, :, 1:2]
    z = xyz[:, :, 2:3]
    div_term = (
        torch.arange(0, C, 2, device=xyz.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, C // 2)

    pe_x = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)
    pe_z = torch.zeros(B, N, C, device=xyz.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe_z[:, :, 0::2] = torch.sin(z * div_term)
    pe_z[:, :, 1::2] = torch.cos(z * div_term)

    pe = torch.cat([pe_x, pe_y, pe_z], dim=2)  # (B, N, C*3)
    if cat_coords:
        pe = torch.cat([pe, xyz], dim=2)  # (B, N, C*3+3)
    return pe


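# Illustrative shape check for the coordinate embeddings above; the 2-D and
# 4-D variants behave analogously. A sketch with arbitrary values, not part of
# the original API:
def _demo_3d_embedding():
    xyz = torch.rand(2, 100, 3)  # (B, N, 3) coordinates
    pe = get_3d_embedding(xyz, C=64, cat_coords=True)
    assert pe.shape == (2, 100, 64 * 3 + 3)  # sin/cos per axis plus raw coords

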
def get_4d_embedding(xyzw, C, cat_coords=True):
    B, N, D = xyzw.shape
    assert D == 4

    x = xyzw[:, :, 0:1]
    y = xyzw[:, :, 1:2]
    z = xyzw[:, :, 2:3]
    w = xyzw[:, :, 3:4]
    div_term = (
        torch.arange(0, C, 2, device=xyzw.device, dtype=torch.float32) * (1000.0 / C)
    ).reshape(1, 1, C // 2)

    pe_x = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_z = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)
    pe_w = torch.zeros(B, N, C, device=xyzw.device, dtype=torch.float32)

    pe_x[:, :, 0::2] = torch.sin(x * div_term)
    pe_x[:, :, 1::2] = torch.cos(x * div_term)

    pe_y[:, :, 0::2] = torch.sin(y * div_term)
    pe_y[:, :, 1::2] = torch.cos(y * div_term)

    pe_z[:, :, 0::2] = torch.sin(z * div_term)
    pe_z[:, :, 1::2] = torch.cos(z * div_term)

    pe_w[:, :, 0::2] = torch.sin(w * div_term)
    pe_w[:, :, 1::2] = torch.cos(w * div_term)

    pe = torch.cat([pe_x, pe_y, pe_z, pe_w], dim=2)  # (B, N, C*4)
    if cat_coords:
        pe = torch.cat([pe, xyzw], dim=2)  # (B, N, C*4+4)
    return pe


class Embedder_Fourier(nn.Module):
    def __init__(self, input_dim, max_freq_log2, N_freqs,
                 log_sampling=True, include_input=True,
                 periodic_fns=(torch.sin, torch.cos)):
        '''
        :param input_dim: dimension of input to be embedded
        :param max_freq_log2: log2 of max freq; min freq is 1 by default
        :param N_freqs: number of frequency bands
        :param log_sampling: if True, frequency bands are linearly sampled in log-space
        :param include_input: if True, raw input is included in the embedding
        :param periodic_fns: periodic functions used to embed input
        '''
        super().__init__()

        self.input_dim = input_dim
        self.include_input = include_input
        self.periodic_fns = periodic_fns

        self.out_dim = 0
        if self.include_input:
            self.out_dim += self.input_dim
        self.out_dim += self.input_dim * N_freqs * len(self.periodic_fns)

        if log_sampling:
            self.freq_bands = 2.0 ** torch.linspace(0.0, max_freq_log2, N_freqs)
        else:
            self.freq_bands = torch.linspace(
                2.0 ** 0.0, 2.0 ** max_freq_log2, N_freqs)
        self.freq_bands = self.freq_bands.numpy().tolist()

    def forward(self,
                input: torch.Tensor,
                rescale: float = 1.0):
        '''
        :param input: tensor of shape [..., self.input_dim]
        :return: tensor of shape [..., self.out_dim]
        '''
        assert input.shape[-1] == self.input_dim

        out = []
        if self.include_input:
            out.append(input / rescale)
        for freq in self.freq_bands:
            for p_fn in self.periodic_fns:
                out.append(p_fn(input * freq))
        out = torch.cat(out, dim=-1)

        assert out.shape[-1] == self.out_dim
        return out


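# Illustrative usage of Embedder_Fourier; the hyperparameters are arbitrary
# and this sketch is not part of the original API:
def _demo_fourier_embedder():
    embedder = Embedder_Fourier(input_dim=3, max_freq_log2=9, N_freqs=10)
    x = torch.rand(4, 3)
    out = embedder(x)
    # out_dim = input_dim + input_dim * N_freqs * 2 = 3 + 3 * 10 * 2 = 63
    assert out.shape == (4, 63)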