Spaces:
Running
on
Zero
Running
on
Zero
| import os.path as osp | |
| import os | |
| import sys | |
| import json | |
| import itertools | |
| import time | |
| from collections import deque | |
| import torch | |
| import tqdm | |
| import concurrent.futures | |
| import psutil | |
| import io | |
| import cv2 | |
| from PIL import Image | |
| import numpy as np | |
| from models.SpaTrackV2.models.utils import matrix_to_quaternion | |
| from models.SpaTrackV2.datasets.base_sfm_dataset import BaseSfMViewDataset | |
| from models.SpaTrackV2.models.utils import ( | |
| camera_to_pose_encoding, pose_encoding_to_camera | |
| ) | |
| from models.SpaTrackV2.models.camera_transform import normalize_cameras | |
| from models.SpaTrackV2.datasets.tartan_utils.traj_tf import ned2cam | |
| from models.SpaTrackV2.datasets.tartan_utils.cam_tf import pos_quats2SE_matrices | |
| from models.SpaTrackV2.utils.visualizer import Visualizer | |
| import glob | |
| from models.SpaTrackV2.datasets.dataset_util import ( | |
| imread_cv2, npz_loader, read_video,npy_loader,resize_crop_video | |
| ) | |
| from scipy.ndimage import median_filter | |
class PointOdy(BaseSfMViewDataset):
    """PointOdyssey clip dataset: one item per scene directory under ``ROOT``.

    A scene is any sub-directory of ``ROOT`` that is not in the hard-coded
    ``exception_scene`` list.  Each frame ``<name>.jpg`` is expected to have a
    sibling ``<name>.png`` (depth) and ``<name>.npz`` (metadata with the keys
    ``R_cam2world``, ``t_cam2world``, ``intrinsics``, ``traj_3d``, ``traj_2d``
    and ``visib``).

    ``self.num_views`` and ``self.track_num`` are read but not set here; they
    are presumably provided by ``BaseSfMViewDataset`` -- verify against it.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Index the scene directories found directly under ``ROOT``.

        Args:
            *args: Forwarded unchanged to ``BaseSfMViewDataset.__init__``.
            ROOT: Directory whose sub-directories are the individual scenes.
            **kwargs: Forwarded unchanged to ``BaseSfMViewDataset.__init__``.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        # Scenes are excluded by *full path*, so this list only has an effect
        # when ROOT is the directory these paths live in.
        self.exception_scene = [
            "/mnt/bn/haotongdata/Datasets/pointodyssey_processed/train/r3_new_f",
            "/mnt/bn/haotongdata/Datasets/pointodyssey_processed/train/r6_new_f",
            "/mnt/bn/haotongdata/Datasets/pointodyssey_processed/train/cnb_dlab_0215_3rd",
        ]
        scene_dirs = [
            osp.join(self.ROOT, scene)
            for scene in os.listdir(self.ROOT)
            if os.path.isdir(osp.join(self.ROOT, scene))
        ]
        self.scene_list = [
            scene for scene in scene_dirs if scene not in self.exception_scene
        ]

    def __len__(self):
        """Return the number of usable scenes."""
        return len(self.scene_list)

    @staticmethod
    def _load_frames(image_paths):
        """Load RGB, depth and metadata for each frame and stack them over time.

        Returns:
            Tuple ``(rgbs, depths, extrs, intrs, tracks3d, tracks2d, visbs)``
            of numpy arrays, each stacked along a new leading frame axis.
        """
        rgbs, depths, extrs, intrs = [], [], [], []
        tracks3d, tracks2d, visbs = [], [], []
        for img_dir in image_paths:
            # Swap only the file extension; a plain str.replace("jpg", ...)
            # would also rewrite any "jpg" inside a directory name.
            stem = osp.splitext(img_dir)[0]
            depth_dir = stem + ".png"
            meta_dir = stem + ".npz"

            # load rgb and depth (depth PNG is rescaled by 1000 / 65535)
            rgbs.append(imread_cv2(img_dir))
            depths.append(
                imread_cv2(depth_dir, cv2.IMREAD_UNCHANGED) / 65535.0 * 1000.0
            )

            # NOTE(security): allow_pickle=True can execute arbitrary code when
            # the .npz comes from an untrusted source.
            # The context manager closes the archive once it is materialised.
            with np.load(meta_dir, allow_pickle=True) as meta_file:
                meta = dict(meta_file)

            # assemble the 4x4 pose from the stored rotation / translation
            extr_i = np.eye(4)
            extr_i[:3, :3] = meta['R_cam2world']
            extr_i[:3, 3] = meta['t_cam2world']
            extrs.append(extr_i)
            intrs.append(meta['intrinsics'])
            tracks3d.append(meta['traj_3d'])
            tracks2d.append(meta['traj_2d'])
            visbs.append(meta['visib'])

        return tuple(
            np.stack(seq, axis=0)
            for seq in (rgbs, depths, extrs, intrs, tracks3d, tracks2d, visbs)
        )

    def _get_views(self, idx, resolution, rng):
        """Sample one strided clip from scene ``idx`` and build its training views.

        Args:
            idx: Index into ``self.scene_list``.
            resolution: Indexable whose first element is the target size passed
                to ``resize_crop_video`` and used to normalise the focal length.
            rng: Random generator exposing ``choice``; used to subsample tracks.

        Returns:
            dict with keys ``rgbs``, ``depths``, ``pose_enc``, ``traj_mat``,
            ``intrs``, ``traj_3d``, ``vis``, ``syn_real``, ``metric_rel``,
            ``static`` and ``data_dir``.
        """
        #TODO: remove this
        scene = self.scene_list[idx]
        imgs_pool = sorted(glob.glob(osp.join(scene, '*.jpg')))
        T = len(imgs_pool)

        # Temporal sampling.
        # NOTE(review): int(uniform(2, 3)) truncates to 2 every time, so the
        # stride is effectively constant.  Start/stride are drawn from the
        # global np.random state rather than from ``rng``; kept as-is so the
        # sampling stream is unchanged.
        stride = int(np.random.uniform(2, 3))
        start = np.random.choice(np.arange(0, max(T - stride * self.num_views, 1)))
        # Indices past the end of the sequence are clamped to the last frame.
        idxs = np.arange(start, start + stride * self.num_views, stride).clip(0, T - 1)
        images_pick = np.array(imgs_pool)[idxs]

        # get all the attributes
        rgbs, depths, extrs, intrs, tracks3d, tracks2d, visbs = self._load_frames(
            images_pick
        )
        T, _, _, _ = rgbs.shape

        # Keep only tracks flagged visible in more than a third of the frames.
        vis = visbs
        mask_track = vis.sum(axis=0) > T // 3
        tracks3d = tracks3d[:, mask_track, :]
        tracks2d = tracks2d[:, mask_track, :]
        vis = vis[:, mask_track]

        half_tracks = self.track_num // 2
        if tracks3d.shape[1] > half_tracks:
            # Enough tracks: keep them all for now, subsampling happens after
            # the re-projection / visibility update below.
            traj_3d = tracks3d
            traj_2d = tracks2d
        else:
            # Too few tracks: fall back to all-zero placeholders.
            traj_2d = np.zeros((self.num_views, half_tracks, 2))
            traj_3d = np.zeros((self.num_views, half_tracks, 3))
            vis = np.zeros((self.num_views, half_tracks))
        if traj_3d.shape[-1] != 3:
            print("The shape of traj_3d is not correct")
            traj_2d = np.zeros((self.num_views, half_tracks, 2))
            traj_3d = np.zeros((self.num_views, half_tracks, 3))
            vis = np.zeros((self.num_views, half_tracks))

        poses = extrs.copy()

        # get tensor track
        traj_2d = torch.from_numpy(traj_2d)
        traj_3d = torch.from_numpy(traj_3d)
        vis = torch.from_numpy(vis)

        # crop and resize
        rgbs, depths, Intrs = resize_crop_video(rgbs, depths, intrs, resolution[0])

        # update the visibility (skipped for the all-zero placeholder tracks)
        if traj_3d.sum() != 0:
            traj_3d_one = torch.cat(
                [traj_3d, torch.ones(traj_3d.shape[0], traj_3d.shape[1], 1)], dim=-1
            )
            # Apply the per-frame 4x4 pose to the homogeneous track points.
            traj_3d_cam = torch.einsum(
                'tbc,tnc->tnb', torch.from_numpy(poses).float(), traj_3d_one
            )
            traj_3d_cam = traj_3d_cam[:, :, :3]
            # Perspective projection with the resized intrinsics; dividing by
            # |z| leaves the third homogeneous component at +/-1.
            traj_2d_proj = torch.einsum(
                'tbc,tnc->tnb', Intrs, traj_3d_cam / (traj_3d_cam[:, :, 2:3].abs())
            )
            H_, W_ = rgbs.shape[-2:]
            in_scope = (
                (traj_2d_proj[..., 0] > 0) & (traj_2d_proj[..., 0] < W_)
                & (traj_2d_proj[..., 1] > 0) & (traj_2d_proj[..., 1] < H_)
            )
            vis = vis & in_scope
            # traj_3d is rewritten in place as (u, v, depth).
            traj_3d[..., :2] = traj_2d_proj[..., :2]
            traj_3d[..., 2] = traj_3d_cam[..., 2]

        # filter the invisible points
        # NOTE(review): this and the fixed-size selection below are applied on
        # every path (not only after re-projection), so the placeholder case
        # also ends up as float32 zeros -- confirm against the upstream file.
        mask_vis = vis.sum(dim=0) > 0
        traj_3d = traj_3d[:, mask_vis]
        vis = vis[:, mask_vis]

        # pick fixed number of points
        if traj_3d.shape[1] < half_tracks:
            traj_3d = torch.zeros(self.num_views, half_tracks, 3)
            vis = torch.zeros(self.num_views, half_tracks)
        else:
            idxs_p = rng.choice(traj_3d.shape[1], half_tracks, replace=False)
            traj_3d = traj_3d[:, idxs_p]
            vis = vis[:, idxs_p]

        # encode the camera poses
        Extrs = torch.from_numpy(poses)
        camera_poses = torch.inverse(Extrs)  #NOTE: C2W
        focal0 = Intrs[:, 0, 0] / resolution[0]
        focal1 = Intrs[:, 1, 1] / resolution[0]
        focal = (focal0.unsqueeze(1) + focal1.unsqueeze(1)) / 2

        # first frame normalize: express every pose relative to frame 0
        camera_poses = torch.inverse(camera_poses[:1]) @ camera_poses

        # Scale normalisation is disabled: the radius is pinned to 1, so the
        # divisions below leave translations, depths and track depths unscaled.
        Radius = 1
        camera_poses[:, :3, 3] = (camera_poses[:, :3, 3]) / Radius
        R = camera_poses[:, :3, :3]
        t = camera_poses[:, :3, 3]
        rot_vec = matrix_to_quaternion(R)
        pose_enc = torch.cat([t, rot_vec, focal], dim=1)

        depth_cano = depths / Radius
        traj_3d[..., 2] = traj_3d[..., 2] / Radius
        # NaN never compares equal to itself, so `== torch.nan` selects
        # nothing; torch.isnan is required to actually zero invalid depths.
        depth_cano[torch.isnan(depth_cano)] = 0

        syn_real = torch.tensor([1])
        metric_rel = torch.tensor([1])
        static = torch.tensor([0])
        data_dir = scene
        views = dict(
            rgbs=rgbs,
            depths=depth_cano,
            pose_enc=pose_enc,
            traj_mat=camera_poses,
            intrs=Intrs,
            traj_3d=traj_3d,
            vis=vis,
            syn_real=syn_real,
            metric_rel=metric_rel,
            static=static,
            data_dir=data_dir,
        )
        return views
if __name__ == "__main__":
    # Manual smoke test: pull one clip from a fixed scene index, render the 2D
    # tracks to a video, push the clip to the 4D viewer, then stop in pdb.
    from models.videocam.datasets.base_sfm_dataset import view_name
    from functools import partial

    DATA_DIR = "/mnt/bn/haotongdata/Datasets/pointodyssey_processed/train/"
    point_odyssey = PointOdy(
        split='train',
        ROOT=DATA_DIR,
        resolution=518,
        aug_crop=16,
        num_views=48,
    )
    # Fixed seed so the track subsampling inside _get_views is repeatable.
    sample_rng = np.random.default_rng(seed=0)
    sample = point_odyssey._get_views(52, (518, 518), sample_rng)

    # check the 2d tracking vis
    track_viz = Visualizer(
        save_dir=".",
        grayscale=False,
        fps=10,
        pad_value=0,
        tracks_leave_trace=5,
    )
    # A leading batch axis is added to each input; only the first two track
    # channels are passed as 2D tracks.
    video_batch = sample["rgbs"][None]
    tracks_batch = sample["traj_3d"][None, ..., :2]
    visibility_batch = sample["vis"][None]
    track_viz.visualize(
        video=video_batch,
        tracks=tracks_batch,
        visibility=visibility_batch,
        filename="test",
    )

    # check the 4d visualization
    from models.videocam.datasets.vis3d_check import vis4d
    vis4d(
        sample["rgbs"],
        sample["depths"],
        sample["traj_mat"],
        sample["intrs"],
        track3d=sample["traj_3d"],
        workspace="/mnt/bn/xyxdata/home/codes/my_projs/SpaTrack2/viser_result/test",
    )

    # Drop into the debugger so the sample can be inspected interactively.
    import pdb
    pdb.set_trace()