# Hugging Face Space status banner (page residue): Running on Zero
| import gradio as gr | |
| import os | |
| import json | |
| import numpy as np | |
| import cv2 | |
| import base64 | |
| from typing import List, Tuple | |
# Identifier of the backend Space in "<owner>/<space-name>" form.
# Point this at your own backend deployment.
BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"

# Hugging Face access token, read from the HF_TOKEN environment variable
# (None when the variable is not set).
hf_token = os.environ.get("HF_TOKEN")

# Connection state; both names are rebound by initialize_backend().
backend_client = None
BACKEND_AVAILABLE = False
def initialize_backend():
    """Try to open a gradio_client connection to the backend Space.

    Rebinds the module globals ``backend_client`` and ``BACKEND_AVAILABLE``.

    Returns:
        True when the client was constructed and reported, False when any
        step raised (the failure is printed and the app keeps running).
    """
    global backend_client, BACKEND_AVAILABLE
    try:
        print(f"Attempting to connect to backend: {BACKEND_SPACE_URL}")
        # Imported here so that a missing gradio_client package is reported
        # through the same failure path as a failed connection.
        from gradio_client import Client

        # NOTE(review): this passes the Space *page* URL; confirm that the
        # installed gradio_client accepts it rather than the bare
        # "<owner>/<space-name>" identifier.
        space_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"
        backend_client = Client(space_url, hf_token=hf_token)
        print("✅ Backend connection successful!")
        print(f"🔧 Backend client: {backend_client}")
        BACKEND_AVAILABLE = True
        return True
    except Exception as e:
        print(f"❌ Backend connection failed: {e}")
        print("⚠️ Running in standalone mode (some features may be limited)")
        BACKEND_AVAILABLE = False
        return False
def numpy_to_base64(arr):
    """Encode the raw bytes of a numpy array as a base64 text string.

    Only the byte content is encoded; shape and dtype are not part of the
    result and must be supplied again when decoding.
    """
    raw_bytes = arr.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def base64_to_numpy(b64_str, shape, dtype):
    """Rebuild a numpy array from base64 text plus its shape and dtype.

    Args:
        b64_str: base64 encoding of the array's raw bytes.
        shape: shape to give the decoded array.
        dtype: element type used to interpret the decoded bytes.
    """
    raw_bytes = base64.b64decode(b64_str)
    flat = np.frombuffer(raw_bytes, dtype=dtype)
    return flat.reshape(shape)
def base64_to_image(b64_str):
    """Decode a base64-encoded image file into an RGB numpy array.

    Args:
        b64_str: base64 text of an encoded image; falsy values (None or an
            empty string) are treated as "no image".

    Returns:
        The decoded image converted from BGR to RGB, or None when the input
        is empty, cannot be decoded as an image, or any step raises.
    """
    if not b64_str:
        return None
    try:
        # Decode base64 to bytes, then view them as a uint8 buffer for OpenCV
        img_bytes = base64.b64decode(b64_str)
        nparr = np.frombuffer(img_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            # Handle undecodable data explicitly instead of letting the
            # colour conversion below fail on a None image.
            print("Error converting base64 to image: data is not a decodable image")
            return None
        # Convert BGR to RGB
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error converting base64 to image: {e}")
        return None
def get_video_name(video_path):
    """Return the file name of *video_path* with its last extension removed."""
    filename = os.path.basename(video_path)
    stem, _extension = os.path.splitext(filename)
    return stem
def extract_first_frame(video_path):
    """Read the first frame of a video file as an RGB numpy array.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.

    Returns:
        The first frame converted from BGR to RGB, or None when no frame
        could be read or any step raises.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()
        finally:
            # Release the capture handle even if reading raises
            cap.release()
        if not ret:
            return None
        # Convert BGR to RGB
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
def _upload_via_backend(video):
    """Send the upload to the backend Space.

    Returns the six handler outputs on success, or None when the backend
    reports failure or the call raises (the caller then falls back to local
    processing).
    """
    try:
        print("🔧 Calling backend API for video upload...")
        # Unified API call; only the first two arguments matter for uploads
        result = backend_client.predict(
            "upload_video",    # function_type
            video,             # video file
            "",                # original_image_state (not used for upload)
            [],                # selected_points (not used for upload)
            "positive_point",  # point_type (not used for upload)
            0,                 # point_x (not used for upload)
            0,                 # point_y (not used for upload)
            50,                # grid_size (not used for upload)
            756,               # vo_points (not used for upload)
            3,                 # fps (not used for upload)
            api_name="/predict"
        )
        print("✅ Backend video upload API call successful!")
        print(f"🔧 Result type: {type(result)}")
        print(f"🔧 Result: {result}")
        # A usable response is a dict with a truthy "success" entry
        if not (isinstance(result, dict) and result.get("success")):
            print("Backend processing failed, using local fallback")
            return None
        state = result.get("original_image_state", "")
        frame = result.get("display_image", None)
        points = result.get("selected_points", [])
        # Slider values come from the per-video presets
        grid_size_val, vo_points_val, fps_val = get_video_settings(get_video_name(video))
        return state, frame, points, grid_size_val, vo_points_val, fps_val
    except Exception as e:
        print(f"Backend API call failed: {e}")
        return None


def handle_video_upload(video):
    """Handle a new value of the video input.

    Tries the backend Space first when it is available, otherwise (or on
    backend failure) extracts the first frame locally.

    Returns:
        A 6-tuple ``(original_image_state, display_image, selected_points,
        grid_size, vo_points, fps)``. When *video* is None or an unexpected
        error occurs, the result is ``(None, None, [], 50, 756, 3)``.
    """
    if video is None:
        return None, None, [], 50, 756, 3
    try:
        if BACKEND_AVAILABLE and backend_client:
            backend_outputs = _upload_via_backend(video)
            if backend_outputs is not None:
                return backend_outputs

        # Fallback: local processing
        print("Using local video processing...")
        first_frame = extract_first_frame(video)
        # The local state only records where the video lives
        local_state = json.dumps({
            "video_path": video,
            "frame": "local_processing"
        })
        preset = get_video_settings(get_video_name(video))
        grid_size_val, vo_points_val, fps_val = preset
        return local_state, first_frame, [], grid_size_val, vo_points_val, fps_val
    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
        return None, None, [], 50, 756, 3
def _draw_selected_points(image, points):
    """Draw every well-formed ``[x, y, point_type]`` entry of *points* onto *image* in place."""
    for point in points:
        try:
            px, py, kind = point
            center = (int(px), int(py))
        except (TypeError, ValueError):
            # Skip entries that are not three-item [x, y, type] records
            continue
        color = (0, 255, 0) if kind == 'positive_point' else (255, 0, 0)
        # Filled dot plus a white ring so the marker stays visible
        cv2.circle(image, center, 8, color, -1)
        cv2.circle(image, center, 12, (255, 255, 255), 2)


def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Handle a click on the interactive frame.

    Args:
        original_img: the stored image state; the local fallback expects a
            JSON string with a "video_path" entry.
        sel_pix: points selected so far.
        point_type: "positive_point" or another value (drawn as negative).
        evt: Gradio select event; ``evt.index[0]`` / ``evt.index[1]`` are
            used as the x / y pixel coordinates.

    Returns:
        ``(display_image, selected_points)``. ``(None, [])`` is returned when
        there is no state, the frame cannot be re-read, or an error occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
                result = backend_client.predict(
                    "select_point",  # function_type
                    None,            # video file (not used for select_point)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    point_type,      # point_type
                    evt.index[0],    # point_x
                    evt.index[1],    # point_y
                    50,              # grid_size (not used for select_point)
                    756,             # vo_points (not used for select_point)
                    3,               # fps (not used for select_point)
                    api_name="/predict"
                )
                print("✅ Backend select point API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # A usable response is a dict with a truthy "success" entry
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", sel_pix)
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing with improved visualization
        print("Using local point selection with enhanced visualization...")
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            # State is not a JSON object string (e.g. it came from the backend)
            video_path = None
        if not video_path:
            return None, []

        display_image = extract_first_frame(video_path)
        if display_image is None:
            return None, []

        x, y = evt.index[0], evt.index[1]
        # Copy so the caller's list is never mutated; tolerate a missing list
        new_sel_pix = list(sel_pix) if sel_pix else []
        new_sel_pix.append([x, y, point_type])
        # The frame is re-extracted clean on every click, so redraw all
        # selected points; otherwise earlier markers would disappear.
        _draw_selected_points(display_image, new_sel_pix)
        return display_image, new_sel_pix
    except Exception as e:
        print(f"Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Discard the selected points and restore the unmarked first frame.

    Args:
        original_img: the stored image state; the local fallback expects a
            JSON string with a "video_path" entry.
        sel_pix: the current selection (only forwarded to the backend).

    Returns:
        ``(display_image, selected_points)``. ``(None, [])`` is returned when
        there is no state, no video path can be recovered, or an error occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print("🔧 Calling backend reset points API...")
                result = backend_client.predict(
                    "reset_points",    # function_type
                    None,              # video file (not used for reset_points)
                    original_img,      # original_image_state
                    sel_pix,           # selected_points
                    "positive_point",  # point_type (not used for reset_points)
                    0,                 # point_x (not used for reset_points)
                    0,                 # point_y (not used for reset_points)
                    50,                # grid_size (not used for reset_points)
                    756,               # vo_points (not used for reset_points)
                    3,                 # fps (not used for reset_points)
                    api_name="/predict"
                )
                print("✅ Backend reset points API call successful!")
                print(f"🔧 Result: {result}")
                # A usable response is a dict with a truthy "success" entry
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", [])
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                print(f"Backend API call failed: {e}")

        # Fallback: local processing
        print("Using local reset points...")
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            # State is not a JSON object string; catch only what parsing can
            # raise instead of a bare except.
            video_path = None
        if video_path:
            # Re-extract the original frame without any markers
            return extract_first_frame(video_path), []
        return None, []
    except Exception as e:
        print(f"Error in reset_points: {e}")
        return None, []
def launch_viz(grid_size, vo_points, fps, original_image_state):
    """Run the tracker on the backend Space and return its visualization.

    Args:
        grid_size: value of the "Grid Size" slider, forwarded to the backend.
        vo_points: value of the "VO Points" slider, forwarded to the backend.
        fps: value of the "FPS" slider, forwarded to the backend.
        original_image_state: the stored image state; None means no video
            has been loaded.

    Returns:
        ``(viz_html, track_video_path)``. On backend success these come from
        the backend response (an empty video path is returned as None). When
        the backend is unavailable or the call fails, the first item is an
        HTML notice and the second is None. ``(None, None)`` is returned when
        there is no state or an unexpected error occurs.
    """
    if original_image_state is None:
        return None, None
    try:
        if BACKEND_AVAILABLE and backend_client:
            # Try to use backend API
            try:
                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                print(f"🔧 Original image state type: {type(original_image_state)}")
                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
                result = backend_client.predict(
                    "run_tracker",         # function_type
                    None,                  # video file (not used for run_tracker)
                    original_image_state,  # original_image_state
                    [],                    # selected_points (not used for run_tracker)
                    "positive_point",      # point_type (not used for run_tracker)
                    0,                     # point_x (not used for run_tracker)
                    0,                     # point_y (not used for run_tracker)
                    grid_size,             # grid_size
                    vo_points,             # vo_points
                    fps,                   # fps
                    api_name="/predict"
                )
                print("✅ Backend API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                # A usable response is a dict with a truthy "success" entry
                if isinstance(result, dict) and result.get("success"):
                    viz_html = result.get("viz_html", "")
                    # Hand the video output None rather than an empty path
                    track_video_path = result.get("track_video_path") or None
                    return viz_html, track_video_path
                print("Backend processing failed, showing error message")
            except Exception as e:
                print(f"❌ Backend API call failed: {e}")
                print(f"🔧 Error type: {type(e)}")
                print(f"🔧 Error details: {str(e)}")

        # No local tracker exists: explain that the backend is required.
        # Use the class *name*: the repr "<class '...'>" would be parsed as an
        # HTML tag and render as nothing.
        client_type = type(backend_client).__name__ if backend_client else 'None'
        error_message = f"""
        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
        <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Connection Required</h3>
        <p style='color: #2d3436; line-height: 1.6;'>
        The tracking and visualization features require a connection to the backend Space.
        Please ensure:
        </p>
        <ul style='color: #2d3436; line-height: 1.6;'>
        <li>The backend Space is deployed and running</li>
        <li>The BACKEND_SPACE_URL is correctly configured</li>
        <li>You have proper access permissions to the backend Space</li>
        </ul>
        <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 10px;'>
        <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
        <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
        <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
        <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
        <p style='color: #666; font-size: 12px; margin: 0;'>Client Type: {client_type}</p>
        </div>
        <p style='color: #2d3436; font-weight: bold; margin-top: 15px;'>
        Current Status: Backend unavailable - Running in limited mode
        </p>
        </div>
        """
        return error_message, None
    except Exception as e:
        print(f"Error in launch_viz: {e}")
        return None, None
def clear_all():
    """Return the reset values for the "Clear All" button.

    The tuple is ``(video, frame, selected_points, grid_size, vo_points,
    fps)``: no video, no frame, an empty point list and the default slider
    values 50 / 756 / 3. Nothing else is touched by this function.
    """
    cleared_video = None
    cleared_frame = None
    no_points = []
    return cleared_video, cleared_frame, no_points, 50, 756, 3
def update_tracker_model(model_name):
    """Placeholder for switching the tracker model; ignores *model_name* and returns None."""
    return None
def get_video_settings(video_name):
    """Look up the preset ``(grid_size, vo_points, fps)`` for a video name.

    Names without a preset get the default ``(50, 756, 3)``.
    """
    presets = {
        "kiss": (45, 700, 10),
        "backpack": (40, 600, 2),
        "kitchen": (60, 800, 3),
        "pillow": (35, 500, 2),
        "hockey": (45, 700, 2),
        "drifting": (35, 1000, 6),
        "ball": (45, 256, 6),
        "ken_block_0": (45, 700, 2),
        "ego_kc1": (45, 500, 4),
        "vertical_place": (45, 500, 3),
        "ego_teaser": (45, 1200, 10),
        "robot_unitree": (45, 500, 4),
        "droid_robot": (35, 400, 5),
        "robot_2": (45, 256, 5),
        "cinema_0": (45, 356, 5),
        "cinema_1": (45, 756, 3),
    }
    if video_name in presets:
        return presets[video_name]
    return (50, 756, 3)
def test_backend_connection():
    """Check whether the connected client exposes any API functions.

    Returns:
        False when there is no client, when the client has no non-empty
        ``fns`` attribute, or when the inspection raises; True otherwise.
    """
    if not backend_client:
        return False
    try:
        print("Testing backend connection with a simple call...")
        # NOTE(review): relies on the client exposing a dict-like `fns`
        # attribute; confirm against the installed gradio_client version.
        api_functions = getattr(backend_client, 'fns', None)
        if not api_functions:
            print("❌ Backend API functions not found")
            return False
        print("✅ Backend API functions are available")
        print(f"🔧 Available function indices: {list(api_functions.keys())}")
        return True
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False
def test_backend_api():
    """Print the API functions exposed by the backend client.

    Returns:
        True when the client has a non-empty ``fns`` mapping (each key is
        printed); False when the backend is unavailable, no functions are
        found, or the inspection raises.
    """
    backend_ready = BACKEND_AVAILABLE and backend_client
    if not backend_ready:
        print("❌ Backend not available for testing")
        return False
    try:
        print("🧪 Testing backend API functions...")
        api_functions = getattr(backend_client, 'fns', None)
        if not api_functions:
            print("❌ No functions found in backend API")
            return False
        print(f"✅ Backend has {len(api_functions)} functions available")
        for idx in api_functions.keys():
            print(f"✅ Function {idx} is available")
        return True
    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False
# Connect to the backend once, at import time
print("🚀 Initializing frontend application...")
initialize_backend()

# A client that connected but exposes no API functions counts as unavailable
if BACKEND_AVAILABLE:
    print("🧪 Testing backend connection...")
    test_result = test_backend_connection()
    if not test_result:
        print("❌ Backend connection test failed!")
        BACKEND_AVAILABLE = False
    else:
        print("✅ Backend connection test passed!")
        test_backend_api()
# Build the Gradio interface.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the component order; confirm it against the deployed layout.
print("🎨 Creating Gradio interface...")
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="SpatialTracker V2 - Frontend",
    css="""
    .gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
    }
    .gr-button {
    margin: 5px;
    }
    .gr-form {
    background: white;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    }
    """
) as demo:
    gr.Markdown("""
    # 🎯 SpatialTracker V2 - Frontend Interface
    Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.
    **Instructions:**
    1. Upload a video file or select from examples below
    2. Click on the object you want to track in the first frame
    3. Adjust tracking parameters if needed
    4. Click "Launch Visualization" to start tracking
    """)

    # Connection status, fixed at the time the interface is built
    status_text = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Running in Standalone Mode"
    gr.Markdown(f"**Status:** {status_text}")

    # Video input together with the bundled example clips
    with gr.Group():
        gr.Markdown("### 📂 Example Videos")
        gr.Markdown("Try these example videos to get started quickly:")
        # Created before gr.Examples so the examples can target it
        video_input = gr.Video(label="Upload Video or Select Example", format="mp4")
        gr.Examples(
            examples=[
                [f"examples/{clip}.mp4"]
                for clip in (
                    "kiss", "backpack", "kitchen", "pillow",
                    "hockey", "drifting", "ball", "ken_block_0",
                    "ego_kc1", "vertical_place", "ego_teaser", "robot_unitree",
                    "droid_robot", "robot_2", "cinema_0", "cinema_1",
                )
            ],
            inputs=video_input,
            label="Click on any example to load it"
        )

    with gr.Row():
        # Left column: clickable first frame and point controls
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🎯 Point Selection")
                gr.Markdown("Click on the object you want to track in the frame below:")
                interactive_frame = gr.Image(
                    label="Click to select tracking points",
                    type="numpy",
                    interactive=True
                )
                with gr.Row():
                    point_type = gr.Radio(
                        choices=["positive_point", "negative_point"],
                        value="positive_point",
                        label="Point Type",
                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
                    )
                with gr.Row():
                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")

        # Right column: tracker outputs
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🎬 Tracking Results")
                tracking_result_video = gr.Video(label="Tracking Result Video", interactive=False)
            with gr.Group():
                gr.Markdown("### 🌐 3D Visualization")
                viz_html = gr.HTML(
                    label="3D Trajectory Visualization",
                    value="<p>Upload a video and select points to see 3D visualization here.</p>"
                )

    # Tracking parameters, shown expanded by default
    with gr.Accordion("⚙️ Advanced Settings", open=True):
        gr.Markdown("Adjust these parameters to optimize tracking performance:")
        with gr.Row():
            grid_size = gr.Slider(
                minimum=10, maximum=100, step=10, value=50,
                label="Grid Size",
                info="Size of the tracking grid (larger = more detailed)"
            )
            vo_points = gr.Slider(
                minimum=100, maximum=2000, step=50, value=756,
                label="VO Points",
                info="Number of visual odometry points (more = better accuracy)"
            )
            fps = gr.Slider(
                minimum=1, maximum=30, step=1, value=3,
                label="FPS",
                info="Frames per second for processing (higher = smoother but slower)"
            )

    with gr.Row():
        launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")

    # Per-session state that is not shown in the page
    original_image_state = gr.State(None)
    selected_points = gr.State([])

    # Event wiring
    video_input.change(
        fn=handle_video_upload,
        inputs=[video_input],
        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
    )
    interactive_frame.select(
        fn=select_point,
        inputs=[original_image_state, selected_points, point_type],
        outputs=[interactive_frame, selected_points]
    )
    reset_points_btn.click(
        fn=reset_points,
        inputs=[original_image_state, selected_points],
        outputs=[interactive_frame, selected_points]
    )
    clear_all_btn.click(
        fn=clear_all,
        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
    )
    launch_btn.click(
        fn=launch_viz,
        inputs=[grid_size, vo_points, fps, original_image_state],
        outputs=[viz_html, tracking_result_video]
    )
# Start the server only when this file is run as a script
if __name__ == "__main__":
    print("🌟 Launching SpatialTracker V2 Frontend...")
    backend_status = 'Connected' if BACKEND_AVAILABLE else 'Disconnected'
    print(f"🔗 Backend Status: {backend_status}")
    # NOTE(review): share=True and debug=True are kept as found; confirm
    # both are wanted in the deployed Space.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "debug": True,
        "show_error": True,
    }
    demo.launch(**launch_options)