Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

App Files Files Community

xiaoyuxi committed on Jun 24

Commit

852c062

1 Parent(s): baffcce

backend

Browse files

Files changed (1) hide show

app.py +1018 -702

app.py CHANGED Viewed

@@ -1,412 +1,161 @@
 import gradio as gr
 import os
-import sys
-import logging
-import time
-import uuid
-import atexit
-from concurrent.futures import ThreadPoolExecutor
-from typing import Union, List, Tuple, Dict, Any
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# Import spaces for ZeroGPU support
-try:
-    import spaces
-except ImportError:
-    # Fallback for local development
-    def spaces(func):
-        return func
-# Import other dependencies
-import subprocess
-import cv2
-import numpy as np
-import threading
-import tempfile
-import shutil
-import glob
 import json
 import base64
-import struct
-import zlib
-import argparse
-import socket
-import gc
-from pathlib import Path
-from einops import rearrange
-from tempfile import TemporaryDirectory
-from http.server import SimpleHTTPRequestHandler
-from socketserver import ThreadingTCPServer
-import socketserver
-import http.server
-import torch
-from huggingface_hub import hf_hub_download
-# Import custom modules with error handling
-try:
-    from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
-    from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
-except ImportError as e:
-    logger.error(f"Failed to import custom modules: {e}")
-    raise
-MAX_FRAMES = 80
-try:
-    import vggt
-except:
-    subprocess.run(["pip", "install", "-e", "./models/vggt"], check=True)
-    sys.path.append("/home/user/app/models/vggt")
-# init the model
-os.environ["VGGT_DIR"] = hf_hub_download("facebook/VGGT-1B", "model.pt")
-if os.environ.get("VGGT_DIR", None) is not None:
-    from vggt.models.vggt import VGGT
-    from vggt.utils.load_fn import preprocess_image
-    from vggt.utils.pose_enc import pose_encoding_to_extri_intri
-    vggt_model = VGGT()
-    vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")))
-    vggt_model.eval()
-    vggt_model = vggt_model.to("cuda")
-# Global model initialization
-print("🚀 Initializing global models...")
-def init_global_models():
-    """Initialize global models (CPU only for ZeroGPU compatibility)"""
     try:
-        print("🔧 Loading SAM predictor...")
-        sam_predictor = get_sam_predictor()
-        print("✅ SAM predictor loaded successfully")
-        # Keep on CPU for ZeroGPU - will be moved to GPU in the decorated function
-        print("🔧 Loading tracker models...")
-        out_dir = os.path.join("temp_init", "results")
-        os.makedirs(out_dir, exist_ok=True)
-        tracker_model, tracker_viser = get_tracker_predictor(out_dir, vo_points=756)
-        print("✅ Tracker models loaded successfully")
-        # Keep on CPU for ZeroGPU - will be moved to GPU in the decorated function
-        print("✅ All models initialized successfully!")
-        return True
     except Exception as e:
-        print(f"❌ Error initializing models: {e}")
-        import traceback
-        traceback.print_exc()
         return False
-# Initialize models at startup
-# Thread pool for delayed deletion
-thread_pool_executor = ThreadPoolExecutor(max_workers=2)
-def delete_later(path: Union[str, os.PathLike], delay: int = 600):
-    """Delete file or directory after specified delay (default 10 minutes)"""
-    def _delete():
-        try:
-            if os.path.isfile(path):
-                os.remove(path)
-            elif os.path.isdir(path):
-                shutil.rmtree(path)
-        except Exception as e:
-            logger.warning(f"Failed to delete {path}: {e}")
-    def _wait_and_delete():
-        time.sleep(delay)
-        _delete()
-    thread_pool_executor.submit(_wait_and_delete)
-    atexit.register(_delete)
-def create_user_temp_dir():
-    """Create a unique temporary directory for each user session"""
-    session_id = str(uuid.uuid4())[:8]  # Short unique ID
-    temp_dir = os.path.join("temp", f"session_{session_id}")
-    os.makedirs(temp_dir, exist_ok=True)
-    # Schedule deletion after 10 minutes
-    delete_later(temp_dir, delay=600)
-    return temp_dir
-# Wrap the core GPU functions with @spaces.GPU
-@spaces.GPU
-def gpu_run_sam(image, points, boxes):
-    """GPU-accelerated SAM inference"""
-    # Initialize SAM predictor inside GPU function
-    predictor = get_sam_predictor()
-    # Ensure predictor is on GPU - handle different SAM predictor types
     try:
-        if hasattr(predictor, 'model'):
-            # For transformers SAM
-            predictor.model = predictor.model.cuda()
-        elif hasattr(predictor, 'sam'):
-            # For segment-anything SAM
-            predictor.sam = predictor.sam.cuda()
-        elif hasattr(predictor, 'to'):
-            # Generic PyTorch model
-            predictor = predictor.to('cuda')
-        # Also ensure image is on the right device if it's a tensor
-        if hasattr(image, 'cuda'):
-            image = image.cuda()
-    except Exception as e:
-        print(f"Warning: Could not move predictor to GPU: {e}")
-    return run_inference(predictor, image, points, boxes)
-@spaces.GPU
-def gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps):
-    """GPU-accelerated tracking"""
-    import torchvision.transforms as T
-    import decord
-    # Initialize tracker model inside GPU function
-    out_dir = os.path.join(temp_dir, "results")
-    os.makedirs(out_dir, exist_ok=True)
-    tracker_model, tracker_viser = get_tracker_predictor(out_dir, vo_points=vo_points)
-    # Setup paths
-    video_path = os.path.join(temp_dir, f"{video_name}.mp4")
-    mask_path = os.path.join(temp_dir, f"{video_name}.png")
-    out_dir = os.path.join(temp_dir, "results")
-    os.makedirs(out_dir, exist_ok=True)
-    # Load video using decord
-    video_reader = decord.VideoReader(video_path)
-    video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)  # Convert to tensor and permute to (N, C, H, W)
-    # Downscale so the shorter side becomes 224 px; videos whose shorter side is already <= 224 are left unchanged
-    h, w = video_tensor.shape[2:]
-    scale = max(224 / h, 224 / w)
-    if scale < 1:
-        new_h, new_w = int(h * scale), int(w * scale)
-        video_tensor = T.Resize((new_h, new_w))(video_tensor)
-    video_tensor = video_tensor[::fps].float()[:MAX_FRAMES]
-    # Move video tensor to GPU
-    video_tensor = video_tensor.cuda()
-    print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")
-    depth_tensor = None
-    intrs = None
-    extrs = None
-    data_npz_load = {}
-    # run vggt
-    if os.environ.get("VGGT_DIR", None) is not None:
-        # process the image tensor
-        video_tensor = preprocess_image(video_tensor)[None]
-        with torch.no_grad():
-            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
-                # Predict attributes including cameras, depth maps, and point maps.
-                aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
-                pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
-                # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
-                extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, video_tensor.shape[-2:])
-                # Predict Depth Maps
-                depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_tensor.cuda()/255, ps_idx)
-        depth_tensor = depth_map.squeeze().cpu().numpy()
-        extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
-        extrs[:, :3, :4] = extrinsic.squeeze().cpu().numpy()
-        intrs = intrinsic.squeeze().cpu().numpy()
-        video_tensor = video_tensor.squeeze()
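-        # NOTE: the threshold is the 0.6 quantile of depth_conf.squeeze()[0] (presumably the first frame); only pixels with confidence above it pass the mask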
-        threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
-        unc_metric = depth_conf.squeeze().cpu().numpy() > threshold
-    # Load and process mask
-    if os.path.exists(mask_path):
-        mask = cv2.imread(mask_path)
-        mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
-        mask = mask.sum(axis=-1)>0
-    else:
-        mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
-        grid_size = 10
-    # Get frame dimensions and create grid points
-    frame_H, frame_W = video_tensor.shape[2:]
-    grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")  # Create on GPU
-    # Sample mask values at grid points and filter out points where mask=0
-    if os.path.exists(mask_path):
-        grid_pts_int = grid_pts[0].long()
-        mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
-        grid_pts = grid_pts[:, mask_values]
-    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
-    print(f"Query points shape: {query_xyt.shape}")
-    # Run model inference
-    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        (
-            c2w_traj, intrs, point_map, conf_depth,
-            track3d_pred, track2d_pred, vis_pred, conf_pred, video
-        ) = tracker_model.forward(video_tensor, depth=depth_tensor,
-                            intrs=intrs, extrs=extrs,
-                            queries=query_xyt,
-                            fps=1, full_point=False, iters_track=4,
-                            query_no_BA=True, fixed_cam=False, stage=1,
-                            support_frame=len(video_tensor)-1, replace_ratio=0.2)
-        # Downscale the results so the longer side is at most max_size, keeping the I/O burden small
-        max_size = 224
-        h, w = video.shape[2:]
-        scale = min(max_size / h, max_size / w)
-        if scale < 1:
-            new_h, new_w = int(h * scale), int(w * scale)
-            video = T.Resize((new_h, new_w))(video)
-            video_tensor = T.Resize((new_h, new_w))(video_tensor)
-            point_map = T.Resize((new_h, new_w))(point_map)
-            track2d_pred[...,:2] = track2d_pred[...,:2] * scale
-            intrs[:,:2,:] = intrs[:,:2,:] * scale
-            conf_depth = T.Resize((new_h, new_w))(conf_depth)
-        # Visualize tracks
-        tracker_viser.visualize(video=video[None],
-                        tracks=track2d_pred[None][...,:2],
-                        visibility=vis_pred[None],filename="test")
-        # Save in tapip3d format
-        data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
-        data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
-        data_npz_load["intrinsics"] = intrs.cpu().numpy()
-        data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
-        data_npz_load["video"] = (video_tensor).cpu().numpy()/255
-        data_npz_load["visibs"] = vis_pred.cpu().numpy()
-        data_npz_load["confs"] = conf_pred.cpu().numpy()
-        data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
-        np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
-    return os.path.join(out_dir, "result.npz"), os.path.join(out_dir, "test_pred_track.mp4")
-def compress_and_write(filename, header, blob):
-    header_bytes = json.dumps(header).encode("utf-8")
-    header_len = struct.pack("<I", len(header_bytes))
-    with open(filename, "wb") as f:
-        f.write(header_len)
-        f.write(header_bytes)
-        f.write(blob)
-def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
-    fixed_size = (width, height)
-    data = np.load(npz_file)
-    extrinsics = data["extrinsics"]
-    intrinsics = data["intrinsics"]
-    trajs = data["coords"]
-    T, C, H, W = data["video"].shape
-    fx = intrinsics[0, 0, 0]
-    fy = intrinsics[0, 1, 1]
-    fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
-    fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
-    original_aspect_ratio = (W / fx) / (H / fy)
-    rgb_video = (rearrange(data["video"], "T C H W -> T H W C") * 255).astype(np.uint8)
-    rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
-                          for frame in rgb_video])
-    depth_video = data["depths"].astype(np.float32)
-    if "confs_depth" in data.keys():
-        confs = (data["confs_depth"].astype(np.float32) > 0.5).astype(np.float32)
-        depth_video = depth_video * confs
-    depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
-                            for frame in depth_video])
-    scale_x = fixed_size[0] / W
-    scale_y = fixed_size[1] / H
-    intrinsics = intrinsics.copy()
-    intrinsics[:, 0, :] *= scale_x
-    intrinsics[:, 1, :] *= scale_y
-    min_depth = float(depth_video.min()) * 0.8
-    max_depth = float(depth_video.max()) * 1.5
-    depth_normalized = (depth_video - min_depth) / (max_depth - min_depth)
-    depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)
-    depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
-    depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
-    depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)
-    first_frame_inv = np.linalg.inv(extrinsics[0])
-    normalized_extrinsics = np.array([first_frame_inv @ ext for ext in extrinsics])
-    normalized_trajs = np.zeros_like(trajs)
-    for t in range(T):
-        homogeneous_trajs = np.concatenate([trajs[t], np.ones((trajs.shape[1], 1))], axis=1)
-        transformed_trajs = (first_frame_inv @ homogeneous_trajs.T).T
-        normalized_trajs[t] = transformed_trajs[:, :3]
-    arrays = {
-        "rgb_video": rgb_video,
-        "depths_rgb": depths_rgb,
-        "intrinsics": intrinsics,
-        "extrinsics": normalized_extrinsics,
-        "inv_extrinsics": np.linalg.inv(normalized_extrinsics),
-        "trajectories": normalized_trajs.astype(np.float32),
-        "cameraZ": 0.0
-    }
-    header = {}
-    blob_parts = []
-    offset = 0
-    for key, arr in arrays.items():
-        arr = np.ascontiguousarray(arr)
-        arr_bytes = arr.tobytes()
-        header[key] = {
-            "dtype": str(arr.dtype),
-            "shape": arr.shape,
-            "offset": offset,
-            "length": len(arr_bytes)
-        }
-        blob_parts.append(arr_bytes)
-        offset += len(arr_bytes)
-    raw_blob = b"".join(blob_parts)
-    compressed_blob = zlib.compress(raw_blob, level=9)
-    header["meta"] = {
-        "depthRange": [min_depth, max_depth],
-        "totalFrames": int(T),
-        "resolution": fixed_size,
-        "baseFrameRate": fps,
-        "numTrajectoryPoints": normalized_trajs.shape[1],
-        "fov": float(fov_y),
-        "fov_x": float(fov_x),
-        "original_aspect_ratio": float(original_aspect_ratio),
-        "fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
-    }
-    # Create temporary file for compression
-    temp_data_file = f'./temp_data_{int(time.time())}.bin'
-    compress_and_write(temp_data_file, header, compressed_blob)
-    # Read the compressed data and encode to base64
-    with open(temp_data_file, "rb") as f:
-        encoded_blob = base64.b64encode(f.read()).decode("ascii")
-    # Clean up temporary file
-    os.unlink(temp_data_file)
-    # Read the HTML template and inject the base64 data
-    with open('./_viz/viz_template.html') as f:
-        html_template = f.read()
-    # Inject the base64 data into the HTML
-    html_content = html_template.replace(
-        "<head>",
-        f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
-    )
-    return html_content
 def numpy_to_base64(arr):
     """Convert numpy array to base64 string"""
@@ -416,367 +165,934 @@ def base64_to_numpy(b64_str, shape, dtype):
     """Convert base64 string back to numpy array"""
     return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
 def get_video_name(video_path):
     """Extract video name without extension"""
     return os.path.splitext(os.path.basename(video_path))[0]
-# Backend API Functions
-def backend_upload_video(video_path: str) -> Dict[str, Any]:
-    """Backend API for video upload"""
     try:
-        # Create user-specific temporary directory
-        user_temp_dir = create_user_temp_dir()
-        # Get original video name
-        video_name = get_video_name(video_path)
-        temp_video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
-        shutil.copy(video_path, temp_video_path)
-        print(f"Video saved to: {temp_video_path}")
-        # Extract first frame
-        cap = cv2.VideoCapture(temp_video_path)
-        success, frame = cap.read()
         cap.release()
-        if not success:
-            return {"success": False, "error": "Failed to read video"}
-        # Resize frame to have minimum side length of 336
-        h, w = frame.shape[:2]
-        scale = 336 / min(h, w)
-        new_h, new_w = int(h * scale)//2*2, int(w * scale)//2*2
-        frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
-        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        # Convert frame to base64 string for storage, include temp_dir info
-        frame_data = {
-            'data': numpy_to_base64(frame),
-            'shape': frame.shape,
-            'dtype': str(frame.dtype),
-            'temp_dir': user_temp_dir  # Store temp directory path
-        }
-        return {
-            "success": True,
-            "original_image_state": json.dumps(frame_data),
-            "display_image": frame,
-            "selected_points": [],
-            "temp_dir": user_temp_dir
-        }
     except Exception as e:
-        logger.error(f"Error in backend_upload_video: {e}")
-        return {"success": False, "error": str(e)}
-def backend_select_point(original_img: str, sel_pix: list, point_type: str, point_x: int, point_y: int) -> Dict[str, Any]:
-    """Backend API for point selection"""
     try:
-        # Convert stored image data back to numpy array
-        frame_data = json.loads(original_img)
-        original_img = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
-        temp_dir = frame_data.get('temp_dir', 'temp')  # Get user-specific temp dir
-        # Create a display image for visualization
-        display_img = original_img.copy()
-        # Create a new list instead of modifying the existing one
-        new_sel_pix = sel_pix.copy() if sel_pix else []
-        new_sel_pix.append(((point_x, point_y), 1 if point_type == 'positive_point' else 0))
-        # Run SAM inference
-        o_masks = gpu_run_sam(original_img, new_sel_pix, [])
-        # Draw points on display image
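-        COLORS = [(0, 0, 255), (0, 255, 255)]  # Written as BGR (red = negative, yellow = positive). NOTE(review): the frame was converted to RGB on upload, so these may render as blue/cyan - confirm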
-        MARKERS = [1, 5]  # cv2.MARKER_TILTED_CROSS (1) for negative, cv2.MARKER_TRIANGLE_UP (5) for positive
-        MARKER_SIZE = 8  # Increased marker size
-        for point, label in new_sel_pix:
-            cv2.drawMarker(display_img, point, COLORS[label], markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)
-        # Draw mask overlay on display image
-        if o_masks:
-            # Get the final mask (which is already processed as pos_mask - neg_mask)
-            mask = o_masks[0][0]  # Get first mask
-            # Create a light blue overlay
-            overlay = display_img.copy()
-            overlay[mask.squeeze()!=0] = [20, 60, 200]  # Light blue in BGR
-            # Blend with original image with lower alpha
-            display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
-        # Save mask
-        if o_masks:
-            video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
-            if video_files:
-                video_name = get_video_name(video_files[0])
-                for mask, _ in o_masks:
-                    o_mask = np.uint8(mask.squeeze() * 255)
-                    o_file = os.path.join(temp_dir, f"{video_name}.png")
-                    cv2.imwrite(o_file, o_mask)
-        return {
-            "success": True,
-            "display_image": display_img,
-            "selected_points": new_sel_pix
-        }
     except Exception as e:
-        logger.error(f"Error in backend_select_point: {e}")
-        return {"success": False, "error": str(e)}
-def backend_reset_points(original_img: str, sel_pix: list) -> Dict[str, Any]:
-    """Backend API for resetting points"""
     try:
-        # Convert stored image data back to numpy array
-        frame_data = json.loads(original_img)
-        original_img = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
-        temp_dir = frame_data.get('temp_dir', 'temp')  # Get user-specific temp dir
-        # Create a display image for visualization (just the original image)
-        display_img = original_img.copy()
-        # Clear all points
-        new_sel_pix = []
-        # Clear any existing masks in user's temp directory
-        for mask_file in glob.glob(os.path.join(temp_dir, "*.png")):
             try:
-                os.remove(mask_file)
             except Exception as e:
-                logger.warning(f"Failed to remove mask file {mask_file}: {e}")
-        return {
-            "success": True,
-            "display_image": display_img,
-            "selected_points": new_sel_pix
-        }
     except Exception as e:
-        logger.error(f"Error in backend_reset_points: {e}")
-        return {"success": False, "error": str(e)}
-def backend_run_tracker(grid_size: int, vo_points: int, fps: int, original_image_state: str) -> Dict[str, Any]:
-    """Backend API for running tracker and visualization"""
     try:
-        # Get user's temp directory from stored frame data
-        frame_data = json.loads(original_image_state)
-        temp_dir = frame_data.get('temp_dir', 'temp')
-        video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
-        if not video_files:
-            return {"success": False, "error": "No video file found"}
-        video_path = video_files[0]
-        video_name = get_video_name(video_path)
-        # Run tracker
-        npz_path, track2d_video = gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps)
-        # Generate HTML content
-        html_content = process_point_cloud_data(npz_path)
-        # Schedule deletion of generated files
-        if os.path.exists(track2d_video):
-            delete_later(track2d_video, delay=600)
-        if os.path.exists(npz_path):
-            delete_later(npz_path, delay=600)
-        return {
-            "success": True,
-            "viz_html": html_content,
-            "track_video_path": track2d_video
-        }
     except Exception as e:
-        logger.error(f"Error in backend_run_tracker: {e}")
-        return {"success": False, "error": str(e)}
-# Remove the separate interfaces and create a unified API handler
-def unified_api_handler(function_type: str, *args) -> Dict[str, Any]:
-    """Unified API handler for all backend functions"""
     try:
-        if function_type == "upload_video":
-            # args[0] should be the video file
-            return backend_upload_video(args[0])
-        elif function_type == "select_point":
-            # args: original_img, sel_pix, point_type, point_x, point_y
-            return backend_select_point(args[0], args[1], args[2], args[3], args[4])
-        elif function_type == "reset_points":
-            # args: original_img, sel_pix
-            return backend_reset_points(args[0], args[1])
-        elif function_type == "run_tracker":
-            # args: grid_size, vo_points, fps, original_image_state
-            return backend_run_tracker(args[0], args[1], args[2], args[3])
         else:
-            return {"success": False, "error": f"Unknown function type: {function_type}"}
     except Exception as e:
-        logger.error(f"Error in unified_api_handler: {e}")
-        return {"success": False, "error": str(e)}
-# Create the main unified API interface
-main_api = gr.Interface(
-    fn=unified_api_handler,
-    inputs=[
-        gr.Dropdown(
-            choices=["upload_video", "select_point", "reset_points", "run_tracker"],
-            label="Function Type",
-            value="upload_video"
-        ),
-        gr.File(label="Video File (for upload_video)", file_types=[".mp4", ".avi", ".mov"]),
-        gr.Textbox(label="Original Image State", value=""),
-        gr.JSON(label="Selected Points", value=[]),
-        gr.Radio(choices=['positive_point', 'negative_point'], label="Point Type", value='positive_point'),
-        gr.Number(label="Point X", value=0),
-        gr.Number(label="Point Y", value=0),
-        gr.Number(label="Grid Size", value=50),
-        gr.Number(label="VO Points", value=756),
-        gr.Number(label="FPS", value=3)
-    ],
-    outputs=[
-        gr.JSON(label="Result")
-    ],
-    title="SpaTrackV2 Backend API",
-    description="Unified Backend API for SpaTrackV2. This is a private Space that provides core functionality.",
-    api_name="unified_api"
-)
-# Create additional interfaces for individual API functions for manual testing
-select_point_api = gr.Interface(
-    fn=backend_select_point,
-    inputs=[
-        gr.Textbox(label="Original Image State"),
-        gr.JSON(label="Selected Points"),
-        gr.Radio(choices=['positive_point', 'negative_point'], label="Point Type"),
-        gr.Number(label="Point X"),
-        gr.Number(label="Point Y")
-    ],
-    outputs=[
-        gr.JSON(label="Result")
-    ],
-    title="Select Point API",
-    description="API for selecting points on video frames"
-)
-reset_points_api = gr.Interface(
-    fn=backend_reset_points,
-    inputs=[
-        gr.Textbox(label="Original Image State"),
-        gr.JSON(label="Selected Points")
-    ],
-    outputs=[
-        gr.JSON(label="Result")
-    ],
-    title="Reset Points API",
-    description="API for resetting points"
-)
-tracker_api = gr.Interface(
-    fn=backend_run_tracker,
-    inputs=[
-        gr.Number(label="Grid Size", value=50),
-        gr.Number(label="VO Points", value=756),
-        gr.Number(label="FPS", value=3),
-        gr.Textbox(label="Original Image State")
-    ],
-    outputs=[
-        gr.JSON(label="Result")
-    ],
-    title="Run Tracker API",
-    description="API for running the tracking algorithm"
-)
-# Create a combined interface with tabs for manual testing
-with gr.Blocks(title="SpaTrackV2 Backend API") as backend_app:
-    gr.Markdown("# 🚀 SpaTrackV2 Backend API")
-    gr.Markdown("This is a private backend Space that provides core SpaTrackV2 functionality.")
-    with gr.Tabs():
-        with gr.TabItem("Unified API"):
-            main_api.render()
-        with gr.TabItem("Upload Video"):
-            upload_api = gr.Interface(
-                fn=backend_upload_video,
-                inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".avi", ".mov"])],
-                outputs=[gr.JSON(label="Result")],
-                title="Upload Video API"
-            )
-            upload_api.render()
-        with gr.TabItem("Select Point"):
-            select_point_api.render()
-        with gr.TabItem("Reset Points"):
-            reset_points_api.render()
-        with gr.TabItem("Run Tracker"):
-            tracker_api.render()
-        with gr.TabItem("API Info"):
-            gr.Markdown("""
-            ## Available API Functions
-            ### Unified API
-            - **Function**: `unified_api_handler`
-            - **Input**: Function type + parameters
-            - **Output**: JSON result
-            ### Individual Functions
-            #### 1. Upload Video
-            - **Function**: `backend_upload_video`
-            - **Input**: Video file
-            - **Output**: Initial state and settings
-            #### 2. Select Point
-            - **Function**: `backend_select_point`
-            - **Input**: Image state + point coordinates
-            - **Output**: Updated image and points
-            #### 3. Reset Points
-            - **Function**: `backend_reset_points`
-            - **Input**: Image state + points
-            - **Output**: Reset image and empty points
-            #### 4. Run Tracker
-            - **Function**: `backend_run_tracker`
-            - **Input**: Parameters + image state
-            - **Output**: Visualization and tracking results
-            ### 5. GPU Functions
-            - `gpu_run_sam(image, points, boxes)`: GPU-accelerated SAM inference
-            - `gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps)`: GPU-accelerated tracking
-            """)
-if __name__ == "__main__":
-    # Print startup information
-    print("🚀 Starting SpaTrackV2 Backend Space...")
-    print(f"🔧 Python version: {sys.version}")
-    print(f"🔧 Working directory: {os.getcwd()}")
-    print(f"🔧 GPU available: {torch.cuda.is_available()}")
-    if torch.cuda.is_available():
-        print(f"🔧 GPU device: {torch.cuda.get_device_name(0)}")
-        print(f"🔧 GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
-    print(f"🔧 Initializing models and GPU resources...")
-    # Initialize global models
-    init_success = init_global_models()
-    if init_success:
-        print("✅ Backend initialization complete!")
-    else:
-        print("❌ Backend initialization failed! Continuing with limited functionality...")
-    print("📡 Starting Gradio backend interface...")
-    print(f"🔧 Available GPU functions: {[name for name in globals() if name.startswith('gpu_')]}")
-    # Launch the complete backend app (not just main_api)
-    backend_app.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,  # Backend shouldn't need sharing
         debug=True,
         show_error=True
     )

 import gradio as gr
 import os
 import json
+import numpy as np
+import cv2
 import base64
+import requests
+import time
+from typing import List, Tuple
+# Backend Space ID in "owner/name" form (not a full URL) - replace with your own backend Space
+BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Interpolated into huggingface.co URLs below
+hf_token = os.getenv("HF_TOKEN")  # Read from the HF_TOKEN environment variable; do not hard-code a token here
+# Debug information
+print(f"🔧 Environment Debug Info:")
+print(f"   - Backend URL: {BACKEND_SPACE_URL}")
+print(f"   - HF Token available: {'Yes' if hf_token else 'No'}")
+print(f"   - HF Token length: {len(hf_token) if hf_token else 0}")
+# Flag to track if backend is available
+BACKEND_AVAILABLE = False
+backend_client = None
def check_user_permissions():
    """Report whether the configured token can reach the backend Space.

    Prints a step-by-step diagnosis while it validates ``hf_token`` against
    the Hugging Face ``whoami`` endpoint and then requests the hub API entry
    for ``BACKEND_SPACE_URL``.

    Returns:
        bool: True only when both requests answer HTTP 200. False when the
        token is missing, when either request answers with another status,
        or when any exception is raised along the way.
    """
    print("🔐 Checking user permissions...")

    # Without a token there is nothing to validate; print setup instructions.
    if not hf_token:
        print("❌ No HF Token found")
        print("🔧 To get a token:")
        print("   1. Go to https://huggingface.co/settings/tokens")
        print("   2. Create a new token with 'read' permissions")
        print("   3. Set it as environment variable: export HF_TOKEN='your_token'")
        return False

    try:
        auth_headers = {'Authorization': f'Bearer {hf_token}'}

        # Step 1: is the token itself accepted?
        whoami = requests.get('https://huggingface.co/api/whoami', headers=auth_headers, timeout=5)
        if whoami.status_code != 200:
            print(f"❌ Token validation failed: {whoami.status_code}")
            print("🔧 Your token might be invalid or expired")
            return False

        username = whoami.json().get('name', 'Unknown')
        print(f"✅ Authenticated as: {username}")

        # Step 2: can this token see the specific backend Space?
        space_response = requests.get(
            f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}",
            headers=auth_headers,
            timeout=5,
        )
        space_status = space_response.status_code
        if space_status == 200:
            print("✅ You have access to the backend Space")
            return True

        if space_status == 401:
            print("❌ You don't have access to the backend Space")
            print("🔧 Solutions:")
            print("   1. Contact the Space owner to add you as collaborator")
            print("   2. Ask the owner to make the Space public")
        elif space_status == 404:
            print("❌ Backend Space not found")
            print("🔧 Please check if the Space URL is correct")
        else:
            print(f"⚠️  Unexpected response checking Space access: {space_status}")
        return False
    except Exception as e:
        # Network failures, bad JSON, etc. are all reported as "no access".
        print(f"❌ Error checking permissions: {e}")
        return False
def check_backend_space_status():
    """Probe the backend Space's web page and guess whether it is running.

    Fetches ``https://huggingface.co/spaces/<BACKEND_SPACE_URL>`` (sending the
    token as a bearer header when one is configured) and prints a diagnosis.

    Returns:
        bool: True when the page answers HTTP 200 and its lower-cased text
        contains none of the markers "runtime error", "building" or
        "sleeping"; False for every other outcome, including request errors.
    """
    try:
        backend_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"
        print(f"🔍 Checking backend space status: {backend_url}")

        headers = {'Authorization': f'Bearer {hf_token}'} if hf_token else {}
        if hf_token:
            print("🔐 Using HF Token for authentication")

        response = requests.get(backend_url, headers=headers, timeout=10)
        status = response.status_code

        if status == 200:
            print("✅ Backend space page is accessible")
            page_text = response.text.lower()
            # Text heuristics, checked in this order; the first hit wins.
            unhealthy_markers = (
                ("runtime error", "❌ Backend space has runtime error"),
                ("building", "🔄 Backend space is building..."),
                ("sleeping", "😴 Backend space is sleeping"),
            )
            for marker, message in unhealthy_markers:
                if marker in page_text:
                    print(message)
                    return False
            print("✅ Backend space appears to be running")
            return True

        if status == 401:
            print("❌ Authentication failed (HTTP 401)")
            print("🔧 This means:")
            print("   - The backend Space is private")
            print("   - Your HF Token doesn't have access to this Space")
            print("   - You need to be added as a collaborator to the Space")
            print("   - Or the Space owner needs to make it public")
        elif status == 404:
            print("❌ Backend space not found (HTTP 404)")
            print("🔧 Please check if the Space URL is correct:")
            print(f"   Current URL: {BACKEND_SPACE_URL}")
        else:
            print(f"❌ Backend space not accessible (HTTP {status})")
            print(f"🔧 Response: {response.text[:200]}...")
        return False
    except requests.RequestException as e:
        print(f"❌ Failed to check backend space status: {e}")
        return False
    except Exception as e:
        print(f"❌ Unexpected error checking backend: {e}")
        return False
def initialize_backend():
    """Create the gradio_client connection to the backend Space.

    Rebinds the module globals ``backend_client`` and ``BACKEND_AVAILABLE``.
    The token is passed to ``Client`` only when one is configured, and
    ``view_api()`` is called once as a connectivity check.

    Returns:
        bool: True when the client was created and ``view_api()`` succeeded,
        False when anything in that sequence raised.
    """
    global backend_client, BACKEND_AVAILABLE

    try:
        # Imported lazily so a missing gradio_client is reported like any
        # other connection failure instead of breaking module import.
        from gradio_client import Client

        client_kwargs = {"hf_token": hf_token} if hf_token else {}
        backend_client = Client(BACKEND_SPACE_URL, **client_kwargs)

        # Test the connection
        backend_client.view_api()
    except Exception as e:
        print(f"❌ Backend connection failed: {e}")
        BACKEND_AVAILABLE = False
        return False

    BACKEND_AVAILABLE = True
    return True
 def numpy_to_base64(arr):
     """Convert numpy array to base64 string"""
     """Convert base64 string back to numpy array"""
     return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
def base64_to_image(b64_str):
    """Decode a base64-encoded image file into an RGB numpy array.

    Args:
        b64_str: base64 text of an encoded image; falsy values yield None.

    Returns:
        The image decoded by ``cv2.imdecode`` with its channels converted
        from BGR to RGB, or None when the input is empty or any step raises
        (the error is printed).
    """
    if not b64_str:
        return None

    try:
        encoded_bytes = np.frombuffer(base64.b64decode(b64_str), np.uint8)
        bgr_image = cv2.imdecode(encoded_bytes, cv2.IMREAD_COLOR)
        # OpenCV decodes to BGR channel order; the rest of the app uses RGB.
        return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error converting base64 to image: {e}")
        return None
 def get_video_name(video_path):
     """Extract video name without extension"""
     return os.path.splitext(os.path.basename(video_path))[0]
def extract_first_frame(video_path):
    """Read the first frame of a video file and return it in RGB order.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.

    Returns:
        The first frame converted from BGR to RGB, or None when no frame
        could be read or any step raised (the error is printed).
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()
        finally:
            # Always free the capture handle, even if read() raises.
            cap.release()

        if not ret:
            return None
        # Convert BGR to RGB
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
def handle_video_upload(video):
    """Handle a newly uploaded video and prepare the first-frame state.

    The backend's ``/unified_api`` endpoint is tried first (function type
    ``"upload_video"``). If the backend is unavailable, raises, or does not
    report success, the first frame is extracted locally, the video is copied
    into ``temp_frontend/session_<timestamp>/`` and a JSON state string is
    built from the frame.

    Args:
        video: path of the uploaded video file, or None.

    Returns:
        tuple: ``(original_image_state, display_image, selected_points,
        grid_size, vo_points, fps)``. For ``video is None`` or an unexpected
        error the tuple is ``(None, None, [], 50, 756, 3)``.
    """
    if video is None:
        return None, None, [], 50, 756, 3

    try:
        if BACKEND_AVAILABLE and backend_client:
            # Preferred path: let the backend Space process the upload.
            try:
                print("🔧 Calling backend API for video upload...")

                # Positional arguments of the unified API; only the first two
                # matter for an upload, the rest are placeholders.
                result = backend_client.predict(
                    "upload_video",    # function_type
                    video,             # video file
                    "",                # original_image_state
                    [],                # selected_points
                    "positive_point",  # point_type
                    0,                 # point_x
                    0,                 # point_y
                    50,                # grid_size
                    756,               # vo_points
                    3,                 # fps
                    api_name="/unified_api"
                )

                print("✅ Backend video upload API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")

                backend_succeeded = isinstance(result, dict) and result.get("success")
                if backend_succeeded:
                    original_image_state = result.get("original_image_state", "")
                    display_image = result.get("display_image", None)
                    selected_points = result.get("selected_points", [])

                    # Per-video presets are keyed by the file's base name.
                    grid_size_val, vo_points_val, fps_val = get_video_settings(get_video_name(video))
                    return original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val

                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing
        print("Using local video processing...")
        display_image = extract_first_frame(video)

        if display_image is None:
            # Frame extraction failed: keep a minimal state that only
            # remembers where the video lives.
            original_image_state = json.dumps({
                "video_path": video,
                "frame": "local_processing_failed"
            })
        else:
            import shutil

            # One working directory per upload, named after the current time
            # in milliseconds.
            session_id = str(int(time.time() * 1000))
            temp_dir = os.path.join("temp_frontend", f"session_{session_id}")
            os.makedirs(temp_dir, exist_ok=True)

            # Copy video to temp directory with standardized name
            video_name = get_video_name(video)
            temp_video_path = os.path.join(temp_dir, f"{video_name}.mp4")
            shutil.copy(video, temp_video_path)

            # State layout intended to be compatible with the backend's.
            original_image_state = json.dumps({
                'data': numpy_to_base64(display_image),
                'shape': display_image.shape,
                'dtype': str(display_image.dtype),
                'temp_dir': temp_dir,
                'video_name': video_name,
                'video_path': temp_video_path  # Keep for backward compatibility
            })

        grid_size_val, vo_points_val, fps_val = get_video_settings(get_video_name(video))
        return original_image_state, display_image, [], grid_size_val, vo_points_val, fps_val
    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
        return None, None, [], 50, 756, 3
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Handle a click on the preview frame and record the selected point.

    The backend's ``/unified_api`` endpoint is tried first (function type
    ``"select_point"``). If it is unavailable, raises, or does not report
    success, the first frame is re-read from the ``video_path`` stored in the
    state and the point is drawn on it locally.

    Args:
        original_img: JSON state string produced on upload, or None.
        sel_pix: points selected so far; may be None or empty.
        point_type: ``'positive_point'`` draws the marker with colour
            ``(0, 255, 0)``; any other value uses ``(255, 0, 0)``.
        evt: Gradio select event; ``evt.index`` holds the clicked ``(x, y)``.

    Returns:
        tuple: ``(display_image, selected_points)``; ``(None, [])`` when
        there is no state, no usable video path, or an error occurs.
    """
    if original_img is None:
        return None, []

    try:
        if BACKEND_AVAILABLE and backend_client:
            # Preferred path: let the backend Space handle the click.
            try:
                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")

                # Call the unified API with select_point function type
                result = backend_client.predict(
                    "select_point",  # function_type
                    None,            # video file (not used for select_point)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    point_type,      # point_type
                    evt.index[0],    # point_x
                    evt.index[1],    # point_y
                    50,              # grid_size (not used for select_point)
                    756,             # vo_points (not used for select_point)
                    3,               # fps (not used for select_point)
                    api_name="/unified_api"
                )

                print("✅ Backend select point API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")

                # Parse the result - expect a dict with success status
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", sel_pix)
                    return display_image, new_sel_pix

                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

                # Turn common gradio_client failures into actionable hints.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")

                # This handler continues with the local fallback below.
                print("🔄 Falling back to local point selection...")

        # Fallback: local processing with improved visualization
        print("Using local point selection with enhanced visualization...")

        # Recover the video path from the JSON state. Only parsing problems
        # are tolerated here: malformed JSON, a non-string state, or a JSON
        # value that is not an object.
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            video_path = None

        if video_path:
            # Re-extract the frame so earlier markers are not accumulated on
            # the image, then draw only the new point.
            display_image = extract_first_frame(video_path)
            if display_image is not None:
                x, y = evt.index[0], evt.index[1]
                color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)

                # Filled dot plus a white ring so it stays visible on any
                # background.
                cv2.circle(display_image, (x, y), 8, color, -1)
                cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)

                # Copy so the caller's list is never mutated; tolerate None.
                new_sel_pix = list(sel_pix or [])
                new_sel_pix.append([x, y, point_type])
                return display_image, new_sel_pix

        return None, []
    except Exception as e:
        print(f"Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Discard all selected points and restore the unmarked first frame.

    The backend's ``/unified_api`` endpoint is tried first (function type
    ``"reset_points"``). If it is unavailable, raises, or does not report
    success, the frame is re-read locally from the ``video_path`` stored in
    the state.

    Args:
        original_img: JSON state string produced on upload, or None.
        sel_pix: currently selected points; forwarded to the backend only.

    Returns:
        tuple: ``(display_image, selected_points)``; ``(None, [])`` when
        there is no state, no usable video path, or an error occurs.
    """
    if original_img is None:
        return None, []

    try:
        if BACKEND_AVAILABLE and backend_client:
            # Preferred path: let the backend Space perform the reset.
            try:
                print("🔧 Calling backend reset points API...")

                # Call the unified API with reset_points function type
                result = backend_client.predict(
                    "reset_points",  # function_type
                    None,            # video file (not used for reset_points)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    "positive_point", # point_type (not used for reset_points)
                    0,               # point_x (not used for reset_points)
                    0,               # point_y (not used for reset_points)
                    50,              # grid_size (not used for reset_points)
                    756,             # vo_points (not used for reset_points)
                    3,               # fps (not used for reset_points)
                    api_name="/unified_api"
                )

                print("✅ Backend reset points API call successful!")
                print(f"🔧 Result: {result}")

                # Parse the result
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", [])
                    return display_image, new_sel_pix

                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing
        print("Using local reset points...")

        # Recover the video path from the JSON state. Only parsing problems
        # are tolerated here: malformed JSON, a non-string state, or a JSON
        # value that is not an object.
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            video_path = None

        if video_path:
            # Re-extract original frame
            return extract_first_frame(video_path), []

        return None, []
    except Exception as e:
        print(f"Error in reset_points: {e}")
        return None, []
def launch_viz(grid_size, vo_points, fps, original_image_state):
    """Run tracking on the backend and return its visualization outputs.

    The backend's ``/unified_api`` endpoint is called with function type
    ``"run_tracker"``. There is no local tracking fallback: when the backend
    is unavailable, rejects the state, raises, or does not report success,
    an HTML panel explaining the situation is returned instead.

    Args:
        grid_size: grid size forwarded to the backend.
        vo_points: VO point count forwarded to the backend.
        fps: frame rate forwarded to the backend.
        original_image_state: JSON state string produced on upload, or None.

    Returns:
        tuple: ``(viz_html, track_video_path)`` on backend success;
        ``(error_html, None)`` when the backend could not be used;
        ``(None, None)`` when there is no state or an unexpected error occurs.
    """
    if original_image_state is None:
        return None, None

    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                print(f"🔧 Original image state type: {type(original_image_state)}")
                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")

                state_to_send = original_image_state

                # A state carrying both "video_path" and "frame" is the
                # placeholder written when local frame extraction failed;
                # it is not sent to the backend.
                try:
                    if isinstance(original_image_state, str):
                        parsed_state = json.loads(original_image_state)
                        if "video_path" in parsed_state and "frame" in parsed_state:
                            print("🔧 Detected local processing state, cannot use backend for tracking")
                            print("🔧 Backend requires proper video upload state from backend API")
                            # Handled by the enclosing except, which then
                            # falls through to the error panel.
                            raise ValueError("Local state cannot be processed by backend")
                except json.JSONDecodeError:
                    print("🔧 Invalid JSON state, cannot send to backend")
                    raise ValueError("Invalid state format")

                # Call the unified API with run_tracker function type
                result = backend_client.predict(
                    "run_tracker",        # function_type
                    None,                 # video file (not used for run_tracker)
                    state_to_send,        # original_image_state
                    [],                   # selected_points (not used for run_tracker)
                    "positive_point",     # point_type (not used for run_tracker)
                    0,                    # point_x (not used for run_tracker)
                    0,                    # point_y (not used for run_tracker)
                    grid_size,            # grid_size
                    vo_points,            # vo_points
                    fps,                  # fps
                    api_name="/unified_api"
                )

                print("✅ Backend API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")

                # Parse the result
                if isinstance(result, dict) and result.get("success"):
                    viz_html = result.get("viz_html", "")
                    track_video_path = result.get("track_video_path", "")
                    return viz_html, track_video_path

                error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
                print(f"❌ Backend processing failed: {error_msg}")
            except Exception as e:
                print(f"❌ Backend API call failed: {e}")
                print(f"🔧 Error type: {type(e)}")
                print(f"🔧 Error details: {str(e)}")

                # Turn common gradio_client failures into actionable hints.
                if "AppError" in str(type(e)):
                    print("🔧 Backend Space has internal errors (AppError)")
                    print("🔧 The backend Space code has bugs or configuration issues")
                    print("🔧 Contact the Space owner to fix the backend implementation")
                elif "Could not fetch config" in str(e):
                    print("🔧 Config fetch failed - possible Gradio version mismatch")
                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
                elif "timeout" in str(e).lower():
                    print("🔧 Backend request timed out - Space might be overloaded")
                elif "Expecting value" in str(e):
                    print("🔧 JSON parsing error in backend - state format mismatch")
                    print("🔧 This happens when using local processing state with backend API")
                    print("🔧 Please upload video again to use backend processing")
                else:
                    print(f"🔧 Unexpected error type: {type(e).__name__}")

                print("🔄 Showing error message instead of visualization...")

        # Everything below builds the explanatory HTML panel.
        import html

        # Summarise the state for the debug section. The file name comes from
        # user-controlled state, so it is HTML-escaped before interpolation.
        state_info = ""
        try:
            if isinstance(original_image_state, str):
                parsed_state = json.loads(original_image_state)
                if "video_path" in parsed_state:
                    video_name = os.path.basename(parsed_state["video_path"])
                    state_info = f"Video: {html.escape(video_name)}"
        except (ValueError, TypeError):
            # Malformed JSON, or a JSON value of an unexpected type.
            state_info = "State format unknown"

        # Fallback: show message that backend is required
        error_message = f"""
        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
            <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Processing Required</h3>
            <p style='color: #2d3436; line-height: 1.6;'>
                The tracking and visualization features require backend processing. The current setup is using local processing which is incompatible with the backend API.
            </p>
            <h4 style='color: #d63031; margin: 15px 0 10px 0;'>Solutions:</h4>
            <ul style='color: #2d3436; line-height: 1.6;'>
                <li><strong>Upload video again:</strong> This will properly initialize the backend state</li>
                <li><strong>Select points on the frame:</strong> Ensure you've clicked on the object to track</li>
                <li><strong>Check backend connection:</strong> Ensure the backend Space is running</li>
                <li><strong>Use compatible state:</strong> Avoid local processing mode</li>
            </ul>
            <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 15px;'>
                <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
                <p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
            </div>
            <div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
                <p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
                <p style='color: #1976d2; font-size: 13px; margin: 0;'>
                    Try uploading your video again - this should properly initialize the backend state for tracking.
                </p>
            </div>
        </div>
        """
        return error_message, None
    except Exception as e:
        print(f"Error in launch_viz: {e}")
        return None, None
def clear_all():
    """Return the default values used to reset the UI.

    No files or buffers are touched here; the function only produces the
    reset tuple ``(None, None, [], 50, 756, 3)`` with a fresh empty list on
    every call.
    """
    empty_state, empty_image, no_points = None, None, []
    default_grid_size, default_vo_points, default_fps = 50, 756, 3
    return empty_state, empty_image, no_points, default_grid_size, default_vo_points, default_fps
def update_tracker_model(model_name):
    """Placeholder for switching the tracker model; ignores its argument."""
    return None
def get_video_settings(video_name):
    """Look up the preset tracking parameters for a known example video.

    Args:
        video_name: base name of the video without extension.

    Returns:
        tuple: ``(grid_size, vo_points, fps)`` for the named video, or
        ``(50, 756, 3)`` when the name has no preset.
    """
    fallback = (50, 756, 3)
    presets = dict(
        kiss=(45, 700, 10),
        backpack=(40, 600, 2),
        kitchen=(60, 800, 3),
        pillow=(35, 500, 2),
        hockey=(45, 700, 2),
        drifting=(35, 1000, 6),
        ball=(45, 256, 6),
        ken_block_0=(45, 700, 2),
        ego_kc1=(45, 500, 4),
        vertical_place=(45, 500, 3),
        ego_teaser=(45, 1200, 10),
        robot_unitree=(45, 500, 4),
        droid_robot=(35, 400, 5),
        robot_2=(45, 256, 5),
        cinema_0=(45, 356, 5),
        cinema_1=(45, 756, 3),
    )
    if video_name in presets:
        return presets[video_name]
    return fallback
def test_backend_connection():
    """Check that the connected client exposes at least one API function.

    No request is sent; the check only inspects the client's ``fns``
    attribute.

    Returns:
        bool: True when ``backend_client`` is set and has a non-empty
        ``fns`` attribute; False otherwise or when the inspection raises.
    """
    if not backend_client:
        return False

    try:
        print("Testing backend connection with a simple call...")

        available_fns = getattr(backend_client, 'fns', None)
        if not available_fns:
            print("❌ Backend API functions not found")
            return False

        print("✅ Backend API functions are available")
        print(f"🔧 Available function indices: {list(available_fns.keys())}")
        return True
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False
def test_backend_api():
    """List the API functions exposed by the connected backend client.

    Returns:
        bool: True when the backend is marked available and the client has a
        non-empty ``fns`` attribute (each index is printed); False when the
        backend is unavailable, no functions are found, or the inspection
        raises.
    """
    backend_ready = BACKEND_AVAILABLE and backend_client
    if not backend_ready:
        print("❌ Backend not available for testing")
        return False

    try:
        print("🧪 Testing backend API functions...")

        available_fns = getattr(backend_client, 'fns', None)
        if not available_fns:
            print("❌ No functions found in backend API")
            return False

        print(f"✅ Backend has {len(available_fns)} functions available")
        for fn_index in available_fns.keys():
            print(f"✅ Function {fn_index} is available")
        return True
    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False
# Connect to the backend once at import time; the handlers above consult the
# BACKEND_AVAILABLE / backend_client globals that this call sets.
print("🚀 Initializing frontend application...")
result = initialize_backend()

# Report the outcome of the connection attempt.
print(
    "✅ Backend connection successful!"
    if result and BACKEND_AVAILABLE
    else "❌ Backend connection failed!"
)

# The UI definition follows.
print("🎨 Creating Gradio interface...")
+with gr.Blocks(
+    theme=gr.themes.Soft(),
+    title="SpatialTracker V2 - Frontend",
+    css="""
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+    }
+    .gr-button {
+        margin: 5px;
+    }
+    .gr-form {
+        background: white;
+        border-radius: 10px;
+        padding: 20px;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+    }
+    /* 固定视频上传组件高度 */
+    .gr-video {
+        height: 300px !important;
+        min-height: 300px !important;
+        max-height: 300px !important;
+    }
+    .gr-video video {
+        height: 260px !important;
+        max-height: 260px !important;
+        object-fit: contain !important;
+        background: #f8f9fa;
+    }
+    .gr-video .gr-video-player {
+        height: 260px !important;
+        max-height: 260px !important;
+    }
+    /* 水平滚动的示例视频样式 */
+    .example-videos .gr-examples {
+        overflow: visible !important;
+    }
+    .example-videos .gr-examples .gr-table-wrapper {
+        overflow-x: auto !important;
+        overflow-y: hidden !important;
+        scrollbar-width: thin;
+        scrollbar-color: #667eea #f1f1f1;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar {
+        height: 8px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
+        background: #f1f1f1;
+        border-radius: 4px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 4px;
+    }
+    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
+        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
+    }
+    .example-videos .gr-examples .gr-table {
+        display: flex !important;
+        flex-wrap: nowrap !important;
+        min-width: max-content !important;
+        gap: 10px !important;
+    }
+    .example-videos .gr-examples .gr-table tbody {
+        display: flex !important;
+        flex-direction: row !important;
+        flex-wrap: nowrap !important;
+        gap: 10px !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr {
+        display: flex !important;
+        flex-direction: column !important;
+        min-width: 120px !important;
+        max-width: 120px !important;
+        margin: 0 !important;
+        background: white;
+        border-radius: 8px;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+        transition: all 0.3s ease;
+        cursor: pointer;
+    }
+    .example-videos .gr-examples .gr-table tbody tr:hover {
+        transform: translateY(-2px);
+        box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2);
+    }
+    .example-videos .gr-examples .gr-table tbody tr td {
+        text-align: center !important;
+        padding: 8px !important;
+        border: none !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr td video {
+        border-radius: 6px !important;
+        width: 100% !important;
+        height: auto !important;
+    }
+    .example-videos .gr-examples .gr-table tbody tr td:last-child {
+        font-size: 12px !important;
+        font-weight: 500 !important;
+        color: #333 !important;
+        padding-top: 4px !important;
+    }
+    /* 新的水平滚动示例视频样式 */
+    .horizontal-examples .gr-examples {
+        overflow: visible !important;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper {
+        overflow-x: auto !important;
+        overflow-y: hidden !important;
+        scrollbar-width: thin;
+        scrollbar-color: #667eea #f1f1f1;
+        padding: 10px 0;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar {
+        height: 8px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
+        background: #f1f1f1;
+        border-radius: 4px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        border-radius: 4px;
+    }
+    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
+        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
+    }
+    .horizontal-examples .gr-examples .gr-table {
+        display: flex !important;
+        flex-wrap: nowrap !important;
+        min-width: max-content !important;
+        gap: 15px !important;
+        padding-bottom: 10px;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody {
+        display: flex !important;
+        flex-direction: row !important;
+        flex-wrap: nowrap !important;
+        gap: 15px !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr {
+        display: flex !important;
+        flex-direction: column !important;
+        min-width: 160px !important;
+        max-width: 160px !important;
+        margin: 0 !important;
+        background: white;
+        border-radius: 12px;
+        box-shadow: 0 3px 12px rgba(0,0,0,0.12);
+        transition: all 0.3s ease;
+        cursor: pointer;
+        overflow: hidden;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr:hover {
+        transform: translateY(-4px);
+        box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td {
+        text-align: center !important;
+        padding: 0 !important;
+        border: none !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td:first-child {
+        padding: 0 !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td video {
+        border-radius: 8px 8px 0 0 !important;
+        width: 100% !important;
+        height: 90px !important;
+        object-fit: cover !important;
+    }
+    .horizontal-examples .gr-examples .gr-table tbody tr td:last-child {
+        font-size: 11px !important;
+        font-weight: 600 !important;
+        color: #333 !important;
+        padding: 8px 12px !important;
+        background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
+        border-radius: 0 0 8px 8px;
+    }
+    """
+) as demo:
+    gr.Markdown("""
+    # 🎯 SpatialTracker V2 - Frontend Interface
+    Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.
+    **Instructions:**
+    1. Upload a video file or select from examples below
+    2. Click on the object you want to track in the first frame
+    3. Adjust tracking parameters if needed
+    4. Click "Launch Visualization" to start tracking
+    """)
+    # Status indicator with more detailed information
+    if BACKEND_AVAILABLE:
+        status_text = "🟢 Backend Connected"
+        status_details = f"Connected to: {BACKEND_SPACE_URL}"
+    else:
+        status_text = "🟡 Running in Standalone Mode"
+        status_details = f"Backend unavailable: {BACKEND_SPACE_URL}"
+    gr.Markdown(f"**Status:** {status_text}")
+    gr.Markdown(f"<small style='color: #666;'>{status_details}</small>", elem_id="status-details")
+    # GitHub star call-to-action: a static, inline-styled HTML card that links to
+    # the project repository (opens in a new tab; hover effect via inline JS handlers).
+    gr.HTML("""
+    <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
+                border-radius: 10px;
+                padding: 15px;
+                margin: 15px 0;
+                box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
+                border: 1px solid rgba(102, 126, 234, 0.15);'>
+        <div style='text-align: center; color: #4a5568;'>
+            <h3 style='margin: 0 0 10px 0; font-size: 18px; text-shadow: none; color: #2d3748;'>
+                ⭐ Love SpatialTracker? Give us a Star! ⭐
+            </h3>
+            <p style='margin: 0 0 12px 0; font-size: 14px; opacity: 0.8; color: #4a5568;'>
+                Help us grow by starring our repository on GitHub! 🚀
+            </p>
+            <div style='display: flex; justify-content: center;'>
+                <a href="https://github.com/henry123-boy/SpaTrackerV2"
+                   target="_blank"
+                   style='display: inline-flex;
+                          align-items: center;
+                          gap: 6px;
+                          background: rgba(102, 126, 234, 0.1);
+                          color: #4a5568;
+                          padding: 8px 16px;
+                          border-radius: 20px;
+                          text-decoration: none;
+                          font-weight: bold;
+                          font-size: 14px;
+                          backdrop-filter: blur(5px);
+                          border: 1px solid rgba(102, 126, 234, 0.2);
+                          transition: all 0.3s ease;'
+                   onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-1px)'"
+                   onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
+                    <span style='font-size: 16px;'>⭐</span>
+                    Star on GitHub
+                </a>
+            </div>
+        </div>
+    </div>
+    """)
+    # Example videos section - moved to top
+    with gr.Group(elem_classes=["example-videos"]):
+        gr.Markdown("### 📂 Example Videos")
+        gr.Markdown("Try these example videos to get started quickly:")
+        # Custom horizontal scrolling video gallery
+        gr.HTML("""
+        <div style='background-color: #f8f9ff; border-radius: 8px; padding: 10px; margin: 10px 0; border-left: 4px solid #667eea;'>
+            <p style='margin: 0; font-size: 13px; color: #666; display: flex; align-items: center; gap: 8px;'>
+                <span style='font-size: 16px;'>💡</span>
+                <strong>Tip:</strong> Scroll horizontally below to see all example videos
+            </p>
+        </div>
+        """)
+        # Define video_input here so it can be referenced in examples
+        video_input = gr.Video(
+            label="Upload Video or Select Example",
+            format="mp4",
+            height=300
+        )
+        # Create a horizontal scrolling container for the examples
+        with gr.Group(elem_classes=["horizontal-examples"]):
+            gr.Examples(
+                examples=[
+                    ["examples/kiss.mp4"],
+                    ["examples/backpack.mp4"],
+                    ["examples/kitchen.mp4"],
+                    ["examples/pillow.mp4"],
+                    ["examples/hockey.mp4"],
+                    ["examples/drifting.mp4"],
+                    ["examples/ball.mp4"],
+                    ["examples/ken_block_0.mp4"],
+                    ["examples/ego_kc1.mp4"],
+                    ["examples/vertical_place.mp4"],
+                    ["examples/ego_teaser.mp4"],
+                    ["examples/robot_unitree.mp4"],
+                    ["examples/droid_robot.mp4"],
+                    ["examples/robot_2.mp4"],
+                    ["examples/cinema_0.mp4"],
+                    ["examples/cinema_1.mp4"],
+                ],
+                inputs=video_input,
+                label="🎬 Click on any example to load it",
+                examples_per_page=16  # Show all examples on one page
+            )
+    # Main workspace: one row split into two columns of equal weight (both scale=1).
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Left column: interactive frame on which the user clicks tracking points
+            with gr.Group():
+                gr.Markdown("### 🎯 Point Selection")
+                gr.Markdown("Click on the object you want to track in the frame below:")
+                interactive_frame = gr.Image(
+                    label="Click to select tracking points",
+                    type="numpy",
+                    interactive=True
+                )
+                # Selects whether the next click counts as a positive or negative point
+                with gr.Row():
+                    point_type = gr.Radio(
+                        choices=["positive_point", "negative_point"],
+                        value="positive_point",
+                        label="Point Type",
+                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
+                    )
+                # Point-management buttons; their click handlers are attached further down
+                with gr.Row():
+                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
+                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
+        with gr.Column(scale=1):
+            # Right column, top: read-only player for the tracking result video
+            with gr.Group():
+                gr.Markdown("### 🎬 Tracking Results")
+                tracking_result_video = gr.Video(
+                    label="Tracking Result Video",
+                    interactive=False,
+                    height=300
+                )
+            # Right column, bottom: HTML slot for the 3D visualization,
+            # showing placeholder text until it is replaced
+            with gr.Group():
+                gr.Markdown("### 🌐 3D Visualization")
+                viz_html = gr.HTML(
+                    label="3D Trajectory Visualization",
+                    value="<p>Upload a video and select points to see 3D visualization here.</p>"
+                )
+    # Advanced settings: accordion (expanded by default) holding the three tracking sliders
+    with gr.Accordion("⚙️ Advanced Settings", open=True):
+        gr.Markdown("Adjust these parameters to optimize tracking performance:")
+        with gr.Row():
+            grid_size = gr.Slider(
+                minimum=10,
+                maximum=100,
+                step=10,
+                value=50,
+                label="Grid Size",
+                info="Size of the tracking grid (larger = more detailed)"
+            )
+            # NOTE(review): value=756 is not on this slider's step grid
+            # (minimum=100 with step=50 yields ... 750, 800 ...), so the default
+            # likely cannot be re-selected by dragging — confirm the intended value/step.
+            vo_points = gr.Slider(
+                minimum=100,
+                maximum=2000,
+                step=50,
+                value=756,
+                label="VO Points",
+                info="Number of visual odometry points (more = better accuracy)"
+            )
+            fps = gr.Slider(
+                minimum=1,
+                maximum=30,
+                step=1,
+                value=3,
+                label="FPS",
+                info="Frames per second for processing (higher = smoother but slower)"
+            )
+    # Launch button: triggers launch_viz (wired below)
+    with gr.Row():
+        launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")
+    # Hidden per-session state: starts as None / an empty list
+    original_image_state = gr.State(None)
+    selected_points = gr.State([])
+    # Event handlers
+    # A change of the video value calls handle_video_upload, whose outputs refresh
+    # both states, the interactive frame, and the three sliders.
+    video_input.change(
+        fn=handle_video_upload,
+        inputs=[video_input],
+        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
+    )
+    # A click on the frame calls select_point with the current state and point type.
+    # NOTE(review): the click coordinates are not listed in inputs — presumably
+    # select_point receives them through an event-data parameter; verify.
+    interactive_frame.select(
+        fn=select_point,
+        inputs=[original_image_state, selected_points, point_type],
+        outputs=[interactive_frame, selected_points]
+    )
+    reset_points_btn.click(
+        fn=reset_points,
+        inputs=[original_image_state, selected_points],
+        outputs=[interactive_frame, selected_points]
+    )
+    # NOTE(review): original_image_state, viz_html and tracking_result_video are
+    # not among these outputs, so this handler cannot reset them — confirm intended.
+    clear_all_btn.click(
+        fn=clear_all,
+        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
+    )
+    # NOTE(review): neither video_input nor selected_points is passed to launch_viz;
+    # presumably it recovers them via original_image_state — verify against launch_viz.
+    launch_btn.click(
+        fn=launch_viz,
+        inputs=[grid_size, vo_points, fps, original_image_state],
+        outputs=[viz_html, tracking_result_video]
+    )
+# Launch the interface
+if __name__ == "__main__":
+    print("🌟 Launching SpatialTracker V2 Frontend...")
+    print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")
+    demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=True,
         debug=True,
         show_error=True
     )