xiaoyuxi
backend
b6d15ed
raw
history blame
26 kB
import gradio as gr
import os
import json
import numpy as np
import cv2
import base64
from typing import List, Tuple
# Hugging Face Space that hosts the backend, written as "owner/name"
# (a Space identifier rather than a full http(s) URL).
BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend" # Change this to point at your own backend Space
# Access token taken from the HF_TOKEN environment variable; None when unset.
hf_token = os.getenv("HF_TOKEN") # Set HF_TOKEN in the environment instead of hard-coding a token here
# Module-level connection state, updated by initialize_backend() and the
# start-up checks further down in this file.
BACKEND_AVAILABLE = False
backend_client = None
def initialize_backend():
    """Connect to the backend Space through ``gradio_client``.

    On success the module-level ``backend_client`` holds the new client and
    ``BACKEND_AVAILABLE`` is set to True. On any failure the flag is set to
    False and the application keeps running in standalone mode.

    Returns:
        bool: True when the client was created, False otherwise.
    """
    global backend_client, BACKEND_AVAILABLE
    try:
        print(f"Attempting to connect to backend: {BACKEND_SPACE_URL}")
        # Imported lazily so that a missing gradio_client package degrades to
        # standalone mode instead of failing at module import.
        from gradio_client import Client

        # Client accepts either a Space id ("owner/name") or the URL of the
        # running Gradio app. The https://huggingface.co/spaces/... address is
        # the hosting page rather than the app itself, so hand over the
        # configured value as-is and let the client resolve it.
        backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)
        print("✅ Backend connection successful!")
        print(f"🔧 Backend client: {backend_client}")
        BACKEND_AVAILABLE = True
        return True
    except Exception as e:
        # Deliberately broad: any connection problem must leave the UI usable.
        print(f"❌ Backend connection failed: {e}")
        print("⚠️ Running in standalone mode (some features may be limited)")
        BACKEND_AVAILABLE = False
        return False
def numpy_to_base64(arr):
    """Encode an array's raw buffer as a base64 text string.

    Only the bytes from ``arr.tobytes()`` are encoded; shape and dtype are not
    stored, so the caller has to supply them again when decoding.
    """
    raw_bytes = arr.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def base64_to_numpy(b64_str, shape, dtype):
    """Rebuild an array from a base64 string plus its shape and dtype.

    The decoded bytes are interpreted with ``np.frombuffer`` using ``dtype``
    and then reshaped to ``shape``.
    """
    raw_bytes = base64.b64decode(b64_str)
    flat = np.frombuffer(raw_bytes, dtype=dtype)
    return flat.reshape(shape)
def base64_to_image(b64_str):
    """Decode a base64-encoded image file into an RGB numpy array.

    Args:
        b64_str: Base64 text of an encoded image (anything ``cv2.imdecode``
            understands). Empty or None input yields None.

    Returns:
        The decoded image with channels reordered from BGR to RGB, or None
        when the input is empty, is not a decodable image, or an error occurs.
    """
    if not b64_str:
        return None
    try:
        img_bytes = base64.b64decode(b64_str)
        encoded = np.frombuffer(img_bytes, np.uint8)
        decoded = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        if decoded is None:
            # imdecode signals undecodable data by returning None; report it
            # here instead of letting cvtColor fail with an opaque assertion.
            print("Error converting base64 to image: data is not a decodable image")
            return None
        # OpenCV decodes to BGR channel order; the rest of the app uses RGB.
        return cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error converting base64 to image: {e}")
        return None
def get_video_name(video_path):
    """Return the file name of ``video_path`` with its last extension removed."""
    file_name = os.path.basename(video_path)
    stem, _extension = os.path.splitext(file_name)
    return stem
def extract_first_frame(video_path):
    """Read the first frame of a video and return it in RGB channel order.

    Args:
        video_path: Path of the video file to open.

    Returns:
        The first frame converted from BGR to RGB, or None when no frame
        could be read or an error occurred.
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()
        finally:
            # Always give the capture handle back, even if read() raises.
            cap.release()
        if not ret:
            return None
        # OpenCV yields BGR frames; convert for display in the UI.
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
def handle_video_upload(video):
    """Prepare the UI state for a newly selected (or cleared) video.

    The backend Space is asked first when it is available. If the call
    raises or does not report success, the first frame is extracted locally
    and a small JSON state holding the video path is produced instead.

    Args:
        video: Path of the uploaded video, or None when the input was cleared.

    Returns:
        tuple: (original image state, display image, selected points,
        grid size, VO points, fps). ``(None, None, [], 50, 756, 3)`` is
        returned when ``video`` is None or an unexpected error occurs.
    """
    if video is None:
        return None, None, [], 50, 756, 3
    try:
        use_backend = BACKEND_AVAILABLE and backend_client
        if use_backend:
            try:
                print("🔧 Calling backend API for video upload...")
                # Positional arguments of the unified /predict endpoint; only
                # the first two matter for an upload.
                request = (
                    "upload_video",    # function_type
                    video,             # video file
                    "",                # original_image_state (not used for upload)
                    [],                # selected_points (not used for upload)
                    "positive_point",  # point_type (not used for upload)
                    0,                 # point_x (not used for upload)
                    0,                 # point_y (not used for upload)
                    50,                # grid_size (not used for upload)
                    756,               # vo_points (not used for upload)
                    3,                 # fps (not used for upload)
                )
                response = backend_client.predict(*request, api_name="/predict")
                print("✅ Backend video upload API call successful!")
                print(f"🔧 Result type: {type(response)}")
                print(f"🔧 Result: {response}")
                backend_ok = isinstance(response, dict) and response.get("success")
                if not backend_ok:
                    print("Backend processing failed, using local fallback")
                else:
                    image_state = response.get("original_image_state", "")
                    frame = response.get("display_image", None)
                    points = response.get("selected_points", [])
                    # Slider presets are keyed by the video's base name.
                    grid, vo, rate = get_video_settings(get_video_name(video))
                    return image_state, frame, points, grid, vo, rate
            except Exception as e:
                # Any backend problem falls through to local processing.
                print(f"Backend API call failed: {e}")
        print("Using local video processing...")
        frame = extract_first_frame(video)
        # Minimal state: enough for the local fallbacks to find the video again.
        image_state = json.dumps({
            "video_path": video,
            "frame": "local_processing"
        })
        grid, vo, rate = get_video_settings(get_video_name(video))
        return image_state, frame, [], grid, vo, rate
    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
        return None, None, [], 50, 756, 3
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Record a clicked point and return the frame annotated with the selection.

    The backend Space is tried first. If it is unavailable, raises, or does
    not report success, the points are drawn locally on the first video frame.

    Args:
        original_img: State string produced on upload. The local fallback
            expects JSON containing a "video_path" key.
        sel_pix: Points selected so far (may be None or empty).
        point_type: "positive_point" or "negative_point".
        evt: Gradio select event; ``evt.index`` supplies the click's x and y.

    Returns:
        tuple: (display image, updated point list). ``(None, [])`` when there
        is no state, the frame cannot be loaded, or an error occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            try:
                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
                # Call the unified API with select_point function type
                result = backend_client.predict(
                    "select_point",  # function_type
                    None,            # video file (not used for select_point)
                    original_img,    # original_image_state
                    sel_pix,         # selected_points
                    point_type,      # point_type
                    evt.index[0],    # point_x
                    evt.index[1],    # point_y
                    50,              # grid_size (not used for select_point)
                    756,             # vo_points (not used for select_point)
                    3,               # fps (not used for select_point)
                    api_name="/predict"
                )
                print("✅ Backend select point API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", sel_pix)
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                # Any backend problem falls through to local drawing.
                print(f"Backend API call failed: {e}")
        print("Using local point selection with enhanced visualization...")
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            # State is not a JSON object (json.JSONDecodeError is a ValueError).
            video_path = None
        display_image = extract_first_frame(video_path) if video_path else None
        if display_image is None:
            return None, []
        x, y = evt.index[0], evt.index[1]
        # Copy so the caller's list is not mutated; tolerate a missing list.
        new_sel_pix = list(sel_pix or [])
        new_sel_pix.append([x, y, point_type])
        # The frame is re-read from disk on every click, so every stored point
        # must be redrawn or earlier selections would vanish from the image.
        for point in new_sel_pix:
            try:
                px, py, kind = point
                center = (int(px), int(py))
            except (TypeError, ValueError):
                # Entry is not in the local [x, y, type] layout; skip drawing it.
                continue
            # Colours are RGB: green for positive points, red otherwise.
            color = (0, 255, 0) if kind == 'positive_point' else (255, 0, 0)
            cv2.circle(display_image, center, 8, color, -1)
            cv2.circle(display_image, center, 12, (255, 255, 255), 2)
        return display_image, new_sel_pix
    except Exception as e:
        print(f"Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Discard all selected points and restore the unannotated first frame.

    The backend Space is tried first. If it is unavailable, raises, or does
    not report success, the first frame is re-read locally from the video
    path stored in the state.

    Args:
        original_img: State string produced on upload. The local fallback
            expects JSON containing a "video_path" key.
        sel_pix: Currently selected points (forwarded to the backend only).

    Returns:
        tuple: (display image, point list). ``(None, [])`` when there is no
        state, no video path can be recovered, or an error occurs.
    """
    if original_img is None:
        return None, []
    try:
        if BACKEND_AVAILABLE and backend_client:
            try:
                print("🔧 Calling backend reset points API...")
                # Call the unified API with reset_points function type
                result = backend_client.predict(
                    "reset_points",    # function_type
                    None,              # video file (not used for reset_points)
                    original_img,      # original_image_state
                    sel_pix,           # selected_points
                    "positive_point",  # point_type (not used for reset_points)
                    0,                 # point_x (not used for reset_points)
                    0,                 # point_y (not used for reset_points)
                    50,                # grid_size (not used for reset_points)
                    756,               # vo_points (not used for reset_points)
                    3,                 # fps (not used for reset_points)
                    api_name="/predict"
                )
                print("✅ Backend reset points API call successful!")
                print(f"🔧 Result: {result}")
                if isinstance(result, dict) and result.get("success"):
                    display_image = result.get("display_image", None)
                    new_sel_pix = result.get("selected_points", [])
                    return display_image, new_sel_pix
                print("Backend processing failed, using local fallback")
            except Exception as e:
                # Any backend problem falls through to local processing.
                print(f"Backend API call failed: {e}")
        print("Using local reset points...")
        try:
            video_path = json.loads(original_img).get("video_path")
        except (TypeError, ValueError, AttributeError):
            # State is not a JSON object (json.JSONDecodeError is a ValueError).
            video_path = None
        if video_path:
            # Re-extract the original frame without any point markers.
            return extract_first_frame(video_path), []
        return None, []
    except Exception as e:
        print(f"Error in reset_points: {e}")
        return None, []
def launch_viz(grid_size, vo_points, fps, original_image_state):
    """Run the tracker on the backend and return its visualization outputs.

    There is no local tracker: when the backend is unavailable, raises, or
    does not report success, an HTML notice with debug details is returned
    in place of the visualization.

    Args:
        grid_size: Grid size forwarded to the backend.
        vo_points: VO point count forwarded to the backend.
        fps: Frame rate forwarded to the backend.
        original_image_state: State produced on upload; None means no video.

    Returns:
        tuple: (visualization HTML, tracking video path). ``(None, None)``
        when no state is available or an unexpected error occurs;
        ``(error HTML, None)`` when the backend could not produce a result.
    """
    if original_image_state is None:
        return None, None
    try:
        if BACKEND_AVAILABLE and backend_client:
            try:
                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
                print(f"🔧 Original image state type: {type(original_image_state)}")
                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
                # Call the unified API with run_tracker function type
                result = backend_client.predict(
                    "run_tracker",         # function_type
                    None,                  # video file (not used for run_tracker)
                    original_image_state,  # original_image_state
                    [],                    # selected_points (not used for run_tracker)
                    "positive_point",      # point_type (not used for run_tracker)
                    0,                     # point_x (not used for run_tracker)
                    0,                     # point_y (not used for run_tracker)
                    grid_size,             # grid_size
                    vo_points,             # vo_points
                    fps,                   # fps
                    api_name="/predict"
                )
                print("✅ Backend API call successful!")
                print(f"🔧 Result type: {type(result)}")
                print(f"🔧 Result: {result}")
                if isinstance(result, dict) and result.get("success"):
                    viz_html = result.get("viz_html", "")
                    track_video_path = result.get("track_video_path", "")
                    return viz_html, track_video_path
                print("Backend processing failed, showing error message")
            except Exception as e:
                # Fall through to the "backend required" notice below.
                print(f"❌ Backend API call failed: {e}")
                print(f"🔧 Error type: {type(e)}")
                print(f"🔧 Error details: {str(e)}")
        # Local import keeps this block self-contained.
        from html import escape

        # repr of a class looks like "<class '...'>"; unescaped, the browser
        # would treat it as a tag and show nothing.
        client_type = escape(str(type(backend_client))) if backend_client else 'None'
        error_message = f"""
<div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
<h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Connection Required</h3>
<p style='color: #2d3436; line-height: 1.6;'>
The tracking and visualization features require a connection to the backend Space.
Please ensure:
</p>
<ul style='color: #2d3436; line-height: 1.6;'>
<li>The backend Space is deployed and running</li>
<li>The BACKEND_SPACE_URL is correctly configured</li>
<li>You have proper access permissions to the backend Space</li>
</ul>
<div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 10px;'>
<p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
<p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
<p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
<p style='color: #666; font-size: 12px; margin: 0;'>Client Type: {client_type}</p>
</div>
<p style='color: #2d3436; font-weight: bold; margin-top: 15px;'>
Current Status: Backend unavailable - Running in limited mode
</p>
</div>
"""
        return error_message, None
    except Exception as e:
        print(f"Error in launch_viz: {e}")
        return None, None
def clear_all():
    """Return the blank values used to reset the UI.

    The tuple holds two cleared components, an empty point list and the
    default grid size, VO points and fps. Nothing else is touched here.
    """
    cleared_points = []
    default_grid_size, default_vo_points, default_fps = 50, 756, 3
    return None, None, cleared_points, default_grid_size, default_vo_points, default_fps
def update_tracker_model(model_name):
    """Placeholder hook for switching tracker models; does nothing yet."""
    return None
def get_video_settings(video_name):
    """Look up the slider presets for a known example video.

    Returns a ``(grid_size, vo_points, fps)`` tuple for ``video_name``, or
    the generic defaults ``(50, 756, 3)`` when the name has no preset.
    """
    fallback = (50, 756, 3)
    # Presets keyed by example video name (file name without extension).
    presets = dict(
        kiss=(45, 700, 10),
        backpack=(40, 600, 2),
        kitchen=(60, 800, 3),
        pillow=(35, 500, 2),
        hockey=(45, 700, 2),
        drifting=(35, 1000, 6),
        ball=(45, 256, 6),
        ken_block_0=(45, 700, 2),
        ego_kc1=(45, 500, 4),
        vertical_place=(45, 500, 3),
        ego_teaser=(45, 1200, 10),
        robot_unitree=(45, 500, 4),
        droid_robot=(35, 400, 5),
        robot_2=(45, 256, 5),
        cinema_0=(45, 356, 5),
        cinema_1=(45, 756, 3),
    )
    if video_name in presets:
        return presets[video_name]
    return fallback
def test_backend_connection():
    """Check whether the backend client exposes any API functions.

    No request is sent; the check only inspects the client's ``fns``
    attribute. The availability flag itself is not modified here.

    Returns:
        bool: True when ``backend_client`` exists and has a non-empty
        ``fns``; False otherwise or on any error.
    """
    if not backend_client:
        return False
    try:
        print("Testing backend connection with a simple call...")
        # NOTE(review): relies on the client exposing an ``fns`` mapping —
        # confirm against the installed gradio_client version.
        available_fns = getattr(backend_client, 'fns', None)
        if not available_fns:
            print("❌ Backend API functions not found")
            return False
        print("✅ Backend API functions are available")
        print(f"🔧 Available function indices: {list(available_fns.keys())}")
        return True
    except Exception as e:
        print(f"❌ Backend connection test failed: {e}")
        return False
def test_backend_api():
    """List the API functions exposed by the backend client.

    Returns:
        bool: True when the backend is marked available and the client has a
        non-empty ``fns``; False otherwise or on any error.
    """
    backend_ready = BACKEND_AVAILABLE and backend_client
    if not backend_ready:
        print("❌ Backend not available for testing")
        return False
    try:
        print("🧪 Testing backend API functions...")
        # NOTE(review): relies on the client exposing an ``fns`` mapping —
        # confirm against the installed gradio_client version.
        available_fns = getattr(backend_client, 'fns', None)
        if not available_fns:
            print("❌ No functions found in backend API")
            return False
        print(f"✅ Backend has {len(available_fns)} functions available")
        for fn_index in available_fns.keys():
            print(f"✅ Function {fn_index} is available")
        return True
    except Exception as e:
        print(f"❌ Backend API test failed: {e}")
        return False
# Connect to the backend once, at import time.
print("🚀 Initializing frontend application...")
initialize_backend()
# A client object alone is not enough: verify it exposes API functions and
# fall back to standalone mode when it does not.
if BACKEND_AVAILABLE:
    print("🧪 Testing backend connection...")
    test_result = test_backend_connection()
    if not test_result:
        print("❌ Backend connection test failed!")
        BACKEND_AVAILABLE = False
    else:
        print("✅ Backend connection test passed!")
        test_backend_api()
# Build the Gradio interface.
# NOTE(review): container nesting below follows the section comments —
# confirm it against the intended page layout.
print("🎨 Creating Gradio interface...")

# Page-level CSS passed to gr.Blocks.
_custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.gr-button {
margin: 5px;
}
.gr-form {
background: white;
border-radius: 10px;
padding: 20px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
"""

# Example clips, expected as examples/<name>.mp4.
_example_names = [
    "kiss",
    "backpack",
    "kitchen",
    "pillow",
    "hockey",
    "drifting",
    "ball",
    "ken_block_0",
    "ego_kc1",
    "vertical_place",
    "ego_teaser",
    "robot_unitree",
    "droid_robot",
    "robot_2",
    "cinema_0",
    "cinema_1",
]

with gr.Blocks(
    theme=gr.themes.Soft(),
    title="SpatialTracker V2 - Frontend",
    css=_custom_css,
) as demo:
    gr.Markdown("""
# 🎯 SpatialTracker V2 - Frontend Interface
Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.
**Instructions:**
1. Upload a video file or select from examples below
2. Click on the object you want to track in the first frame
3. Adjust tracking parameters if needed
4. Click "Launch Visualization" to start tracking
""")

    # Status indicator reflecting the result of the start-up backend checks.
    if BACKEND_AVAILABLE:
        status_text = "🟢 Backend Connected"
    else:
        status_text = "🟡 Running in Standalone Mode"
    gr.Markdown(f"**Status:** {status_text}")

    # Example videos section, placed at the top of the page.
    with gr.Group():
        gr.Markdown("### 📂 Example Videos")
        gr.Markdown("Try these example videos to get started quickly:")
        # Created before gr.Examples so the examples can target it.
        video_input = gr.Video(
            label="Upload Video or Select Example",
            format="mp4",
        )
        gr.Examples(
            examples=[[f"examples/{name}.mp4"] for name in _example_names],
            inputs=video_input,
            label="Click on any example to load it",
        )

    with gr.Row():
        with gr.Column(scale=1):
            # Interactive first frame used for point selection.
            with gr.Group():
                gr.Markdown("### 🎯 Point Selection")
                gr.Markdown("Click on the object you want to track in the frame below:")
                interactive_frame = gr.Image(
                    label="Click to select tracking points",
                    type="numpy",
                    interactive=True,
                )
                with gr.Row():
                    point_type = gr.Radio(
                        choices=["positive_point", "negative_point"],
                        value="positive_point",
                        label="Point Type",
                        info="Positive points indicate the object to track, negative points indicate areas to avoid",
                    )
                with gr.Row():
                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
        with gr.Column(scale=1):
            # Tracking result video.
            with gr.Group():
                gr.Markdown("### 🎬 Tracking Results")
                tracking_result_video = gr.Video(
                    label="Tracking Result Video",
                    interactive=False,
                )
            # 3D visualization.
            with gr.Group():
                gr.Markdown("### 🌐 3D Visualization")
                viz_html = gr.HTML(
                    label="3D Trajectory Visualization",
                    value="<p>Upload a video and select points to see 3D visualization here.</p>",
                )

    # Advanced settings, expanded by default.
    with gr.Accordion("⚙️ Advanced Settings", open=True):
        gr.Markdown("Adjust these parameters to optimize tracking performance:")
        with gr.Row():
            grid_size = gr.Slider(
                minimum=10,
                maximum=100,
                step=10,
                value=50,
                label="Grid Size",
                info="Size of the tracking grid (larger = more detailed)",
            )
            vo_points = gr.Slider(
                minimum=100,
                maximum=2000,
                step=50,
                value=756,
                label="VO Points",
                info="Number of visual odometry points (more = better accuracy)",
            )
            fps = gr.Slider(
                minimum=1,
                maximum=30,
                step=1,
                value=3,
                label="FPS",
                info="Frames per second for processing (higher = smoother but slower)",
            )

    # Launch button.
    with gr.Row():
        launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")

    # Hidden per-session state.
    original_image_state = gr.State(None)
    selected_points = gr.State([])

    # Output groups shared by several handlers below.
    tracking_sliders = [grid_size, vo_points, fps]
    frame_and_points = [interactive_frame, selected_points]

    # Event wiring.
    video_input.change(
        fn=handle_video_upload,
        inputs=[video_input],
        outputs=[original_image_state, *frame_and_points, *tracking_sliders],
    )
    interactive_frame.select(
        fn=select_point,
        inputs=[original_image_state, selected_points, point_type],
        outputs=list(frame_and_points),
    )
    reset_points_btn.click(
        fn=reset_points,
        inputs=[original_image_state, selected_points],
        outputs=list(frame_and_points),
    )
    clear_all_btn.click(
        fn=clear_all,
        outputs=[video_input, *frame_and_points, *tracking_sliders],
    )
    launch_btn.click(
        fn=launch_viz,
        inputs=[*tracking_sliders, original_image_state],
        outputs=[viz_html, tracking_result_video],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    print("🌟 Launching SpatialTracker V2 Frontend...")
    backend_status = 'Connected' if BACKEND_AVAILABLE else 'Disconnected'
    print(f"🔗 Backend Status: {backend_status}")
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": 7860,
        "share": True,
        "debug": True,
        "show_error": True,
    }
    demo.launch(**launch_options)