Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

SpatialTrackerV2 / app.py

xiaoyuxi

backend

b6d15ed 6 months ago

26 kB

	import gradio as gr
	import os
	import json
	import numpy as np
	import cv2
	import base64
	from typing import List, Tuple

	# Identifier of the backend Hugging Face Space in "owner/space-name" form.
	# Despite the constant's name this is a Space ID, not a full URL; change it
	# when deploying against a different backend Space.
	BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend" # Space ID of the backend
	hf_token = os.getenv("HF_TOKEN") # Read from the HF_TOKEN environment variable; None when unset

	# Connection state shared by all handlers. initialize_backend() rebinds both
	# names, and the startup self-test may reset BACKEND_AVAILABLE to False.
	BACKEND_AVAILABLE = False
	# Client object used for backend calls, or None while disconnected.
	backend_client = None

	def initialize_backend():
	    """Connect to the backend Space through ``gradio_client``.

	    On success the module-level ``backend_client`` holds the client and
	    ``BACKEND_AVAILABLE`` is set to True. Any failure (missing package,
	    network error, rejected token, ...) is printed and leaves the app in
	    standalone mode with ``BACKEND_AVAILABLE`` set to False.

	    Returns:
	        bool: True when the client was created, False otherwise.
	    """
	    global backend_client, BACKEND_AVAILABLE
	    try:
	        print(f"Attempting to connect to backend: {BACKEND_SPACE_URL}")

	        # Imported here so that a missing gradio_client package only disables
	        # the backend instead of preventing the whole app from starting.
	        from gradio_client import Client

	        # Client expects either a Space ID ("owner/name") or the URL of the
	        # running Gradio app itself. The huggingface.co/spaces/... address is
	        # the Hub page that wraps the app, not the app endpoint, so the
	        # configured value is passed through unchanged and the client resolves
	        # the host for a Space ID.
	        backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)

	        print("✅ Backend connection successful!")
	        print(f"🔧 Backend client: {backend_client}")

	        BACKEND_AVAILABLE = True
	        return True

	    except Exception as e:
	        # Best-effort by design: the UI still starts without the backend.
	        print(f"❌ Backend connection failed: {e}")
	        print("⚠️ Running in standalone mode (some features may be limited)")
	        BACKEND_AVAILABLE = False
	        return False

	def numpy_to_base64(arr):
	    """Encode the raw bytes of *arr* as a base64 text string.

	    Only the buffer contents are encoded; shape and dtype are not part of
	    the result, so the caller must keep them to rebuild the array.
	    """
	    raw_bytes = arr.tobytes()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("utf-8")

	def base64_to_numpy(b64_str, shape, dtype):
	    """Rebuild an array of the given *dtype* and *shape* from base64 text.

	    The decoded bytes are interpreted as a flat buffer of *dtype* items and
	    then reshaped to *shape*.
	    """
	    raw_bytes = base64.b64decode(b64_str)
	    flat = np.frombuffer(raw_bytes, dtype=dtype)
	    return flat.reshape(shape)

	def base64_to_image(b64_str):
	    """Decode a base64-encoded image file into an RGB numpy array.

	    Returns None when *b64_str* is empty or None, and also when any step of
	    the decoding raises (the error is printed).
	    """
	    if b64_str:
	        try:
	            # base64 text -> encoded image bytes -> uint8 buffer for OpenCV
	            encoded_pixels = np.frombuffer(base64.b64decode(b64_str), np.uint8)
	            bgr_image = cv2.imdecode(encoded_pixels, cv2.IMREAD_COLOR)
	            # OpenCV decodes to BGR; the rest of the app works with RGB.
	            return cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
	        except Exception as e:
	            print(f"Error converting base64 to image: {e}")
	    return None

	def get_video_name(video_path):
	    """Return the file name of *video_path* with directory and last extension removed."""
	    file_name = os.path.basename(video_path)
	    stem, _extension = os.path.splitext(file_name)
	    return stem

	def extract_first_frame(video_path):
	    """Read the first frame of a video file as an RGB array.

	    Args:
	        video_path: Path of the video to open with OpenCV.

	    Returns:
	        The first frame converted from BGR to RGB, or None when no frame
	        could be read or an error occurred (the error is printed).
	    """
	    cap = None
	    try:
	        cap = cv2.VideoCapture(video_path)
	        ret, frame = cap.read()
	        if not ret:
	            return None
	        # OpenCV delivers BGR; the UI expects RGB.
	        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	    except Exception as e:
	        print(f"Error extracting first frame: {e}")
	        return None
	    finally:
	        # Release the capture handle on every path, including when read()
	        # or the colour conversion raises.
	        if cap is not None:
	            cap.release()

	def handle_video_upload(video):
	    """Prepare the UI after a video is uploaded, selected or cleared.

	    The backend Space is asked first (an ``"upload_video"`` request). When it
	    is unavailable, raises, or does not report success, the first frame is
	    extracted locally instead.

	    Args:
	        video: Path of the video file, or None when the input was cleared.

	    Returns:
	        A 6-tuple ``(original_image_state, display_image, selected_points,
	        grid_size, vo_points, fps)``. For None input or an unexpected error
	        it is ``(None, None, [], 50, 756, 3)``.
	    """
	    if video is None:
	        return None, None, [], 50, 756, 3

	    try:
	        use_backend = BACKEND_AVAILABLE and backend_client
	        if use_backend:
	            try:
	                print("🔧 Calling backend API for video upload...")

	                # The backend exposes a single endpoint for every operation;
	                # slots that "upload_video" does not read get placeholders.
	                request_args = (
	                    "upload_video",    # function_type
	                    video,             # video file
	                    "",                # original_image_state (not used for upload)
	                    [],                # selected_points (not used for upload)
	                    "positive_point",  # point_type (not used for upload)
	                    0,                 # point_x (not used for upload)
	                    0,                 # point_y (not used for upload)
	                    50,                # grid_size (not used for upload)
	                    756,               # vo_points (not used for upload)
	                    3,                 # fps (not used for upload)
	                )
	                response = backend_client.predict(*request_args, api_name="/predict")

	                print("✅ Backend video upload API call successful!")
	                print(f"🔧 Result type: {type(response)}")
	                print(f"🔧 Result: {response}")

	                # A usable answer is a dict carrying a truthy "success" entry.
	                if not (isinstance(response, dict) and response.get("success")):
	                    print("Backend processing failed, using local fallback")
	                else:
	                    backend_state = response.get("original_image_state", "")
	                    backend_frame = response.get("display_image", None)
	                    backend_points = response.get("selected_points", [])

	                    # Slider presets depend on the video's base name.
	                    grid_size_val, vo_points_val, fps_val = get_video_settings(
	                        get_video_name(video)
	                    )
	                    return (
	                        backend_state,
	                        backend_frame,
	                        backend_points,
	                        grid_size_val,
	                        vo_points_val,
	                        fps_val,
	                    )
	            except Exception as e:
	                # Any backend problem falls through to local processing.
	                print(f"Backend API call failed: {e}")

	        # Fallback: local processing
	        print("Using local video processing...")
	        first_frame = extract_first_frame(video)

	        # Minimal state: a JSON string that lets later handlers find the file.
	        local_state = json.dumps({
	            "video_path": video,
	            "frame": "local_processing",
	        })

	        grid_size_val, vo_points_val, fps_val = get_video_settings(get_video_name(video))
	        return local_state, first_frame, [], grid_size_val, vo_points_val, fps_val

	    except Exception as e:
	        print(f"Error in handle_video_upload: {e}")
	        return None, None, [], 50, 756, 3

	def _draw_selected_points(image, points):
	    """Draw every well-formed ``[x, y, point_type]`` entry of *points* onto *image* in place."""
	    for entry in points:
	        try:
	            px, py, kind = entry
	            center = (int(px), int(py))
	        except (TypeError, ValueError):
	            # Entries that do not follow the local [x, y, type] layout
	            # cannot be drawn; skip them instead of failing the click.
	            continue
	        # Image is RGB: green marks positive points, red marks the others.
	        color = (0, 255, 0) if kind == 'positive_point' else (255, 0, 0)
	        cv2.circle(image, center, 8, color, -1)
	        cv2.circle(image, center, 12, (255, 255, 255), 2)


	def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
	    """Register a click on the frame as a new selection point.

	    The backend Space is asked first (a ``"select_point"`` request). When it
	    is unavailable, raises, or does not report success, the point is added
	    locally: the first frame is re-read from the video path stored in
	    *original_img* and all selected points, old and new, are drawn on it.

	    Args:
	        original_img: State string produced on upload; None when no video is
	            loaded. The local fallback expects JSON with a "video_path" key.
	        sel_pix: Points selected so far; it is not modified.
	        point_type: "positive_point" or another label (drawn as negative).
	        evt: Gradio select event; ``evt.index`` holds the clicked (x, y).
	            The ``gr.SelectData`` annotation must stay so Gradio passes it.

	    Returns:
	        ``(display_image, selected_points)``, or ``(None, [])`` when no video
	        is loaded, the frame cannot be read, or an error occurs.
	    """
	    if original_img is None:
	        return None, []

	    try:
	        x, y = evt.index[0], evt.index[1]

	        if BACKEND_AVAILABLE and backend_client:
	            # Try to use backend API
	            try:
	                print(f"🔧 Calling backend select point API: x={x}, y={y}, type={point_type}")

	                # Call the unified API with select_point function type
	                result = backend_client.predict(
	                    "select_point",  # function_type
	                    None,            # video file (not used for select_point)
	                    original_img,    # original_image_state
	                    sel_pix,         # selected_points
	                    point_type,      # point_type
	                    x,               # point_x
	                    y,               # point_y
	                    50,              # grid_size (not used for select_point)
	                    756,             # vo_points (not used for select_point)
	                    3,               # fps (not used for select_point)
	                    api_name="/predict"
	                )

	                print("✅ Backend select point API call successful!")
	                print(f"🔧 Result type: {type(result)}")
	                print(f"🔧 Result: {result}")

	                # Parse the result - expect a dict with success status
	                if isinstance(result, dict) and result.get("success"):
	                    display_image = result.get("display_image", None)
	                    new_sel_pix = result.get("selected_points", sel_pix)
	                    return display_image, new_sel_pix
	                print("Backend processing failed, using local fallback")
	            except Exception as e:
	                # Any backend problem falls through to local processing.
	                print(f"Backend API call failed: {e}")

	        # Fallback: local processing with improved visualization
	        print("Using local point selection with enhanced visualization...")

	        # The state is only usable locally when it is a JSON object with a
	        # "video_path" entry; anything else means there is no local source.
	        try:
	            video_path = json.loads(original_img).get("video_path")
	        except (TypeError, ValueError, AttributeError):
	            video_path = None

	        if not video_path:
	            return None, []

	        display_image = extract_first_frame(video_path)
	        if display_image is None:
	            return None, []

	        # Copy so the caller's list is untouched; tolerate a missing list.
	        new_sel_pix = list(sel_pix or [])
	        new_sel_pix.append([x, y, point_type])

	        # The frame was freshly extracted, so redraw every point; drawing
	        # only the newest one would erase earlier selections from view.
	        _draw_selected_points(display_image, new_sel_pix)

	        return display_image, new_sel_pix

	    except Exception as e:
	        print(f"Error in select_point: {e}")
	        return None, []

	def reset_points(original_img: str, sel_pix):
	    """Drop all selected points and show the unannotated first frame again.

	    The backend Space is asked first (a ``"reset_points"`` request). When it
	    is unavailable, raises, or does not report success, the first frame is
	    re-read locally from the video path stored in *original_img*.

	    Args:
	        original_img: State string produced on upload; None when no video is
	            loaded. The local fallback expects JSON with a "video_path" key.
	        sel_pix: Currently selected points; only forwarded to the backend.

	    Returns:
	        ``(display_image, selected_points)``; ``(None, [])`` when no video is
	        loaded, the state holds no video path, or an error occurs.
	    """
	    if original_img is None:
	        return None, []

	    try:
	        if BACKEND_AVAILABLE and backend_client:
	            # Try to use backend API
	            try:
	                print("🔧 Calling backend reset points API...")

	                # Call the unified API with reset_points function type
	                result = backend_client.predict(
	                    "reset_points",    # function_type
	                    None,              # video file (not used for reset_points)
	                    original_img,      # original_image_state
	                    sel_pix,           # selected_points
	                    "positive_point",  # point_type (not used for reset_points)
	                    0,                 # point_x (not used for reset_points)
	                    0,                 # point_y (not used for reset_points)
	                    50,                # grid_size (not used for reset_points)
	                    756,               # vo_points (not used for reset_points)
	                    3,                 # fps (not used for reset_points)
	                    api_name="/predict"
	                )

	                print("✅ Backend reset points API call successful!")
	                print(f"🔧 Result: {result}")

	                # Parse the result
	                if isinstance(result, dict) and result.get("success"):
	                    display_image = result.get("display_image", None)
	                    new_sel_pix = result.get("selected_points", [])
	                    return display_image, new_sel_pix
	                print("Backend processing failed, using local fallback")
	            except Exception as e:
	                # Any backend problem falls through to local processing.
	                print(f"Backend API call failed: {e}")

	        # Fallback: local processing
	        print("Using local reset points...")

	        # Catch only the errors an unusable state can produce (not a string,
	        # invalid JSON, JSON that is not an object). A bare except would also
	        # swallow KeyboardInterrupt and SystemExit.
	        try:
	            video_path = json.loads(original_img).get("video_path")
	        except (TypeError, ValueError, AttributeError):
	            video_path = None

	        if video_path:
	            # Re-extract original frame
	            return extract_first_frame(video_path), []

	        return None, []

	    except Exception as e:
	        print(f"Error in reset_points: {e}")
	        return None, []

	def launch_viz(grid_size, vo_points, fps, original_image_state):
	    """Run the tracker on the backend and return its visualization outputs.

	    Tracking has no local implementation. When the backend is unavailable,
	    raises, or does not report success, an HTML notice with debug
	    information is returned instead of a visualization.

	    Args:
	        grid_size: Value of the "Grid Size" slider, forwarded to the backend.
	        vo_points: Value of the "VO Points" slider, forwarded to the backend.
	        fps: Value of the "FPS" slider, forwarded to the backend.
	        original_image_state: State produced on upload; None when no video
	            is loaded.

	    Returns:
	        ``(viz_html, track_video_path)`` on success, ``(notice_html, None)``
	        when the backend could not do the work, and ``(None, None)`` when no
	        video is loaded or an unexpected error occurs.
	    """
	    # Local import: only the fallback notice needs it.
	    from html import escape

	    if original_image_state is None:
	        return None, None

	    try:
	        if BACKEND_AVAILABLE and backend_client:
	            # Try to use backend API
	            try:
	                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
	                print(f"🔧 Original image state type: {type(original_image_state)}")
	                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")

	                # Call the unified API with run_tracker function type
	                result = backend_client.predict(
	                    "run_tracker",         # function_type
	                    None,                  # video file (not used for run_tracker)
	                    original_image_state,  # original_image_state
	                    [],                    # selected_points (not used for run_tracker)
	                    "positive_point",      # point_type (not used for run_tracker)
	                    0,                     # point_x (not used for run_tracker)
	                    0,                     # point_y (not used for run_tracker)
	                    grid_size,             # grid_size
	                    vo_points,             # vo_points
	                    fps,                   # fps
	                    api_name="/predict"
	                )

	                print("✅ Backend API call successful!")
	                print(f"🔧 Result type: {type(result)}")
	                print(f"🔧 Result: {result}")

	                # Parse the result
	                if isinstance(result, dict) and result.get("success"):
	                    viz_html = result.get("viz_html", "")
	                    track_video_path = result.get("track_video_path", "")
	                    return viz_html, track_video_path
	                print("Backend processing failed, showing error message")
	            except Exception as e:
	                print(f"❌ Backend API call failed: {e}")
	                print(f"🔧 Error type: {type(e)}")
	                print(f"🔧 Error details: {str(e)}")

	        # Fallback: show message that backend is required.
	        # The type repr looks like "<class '...'>"; unescaped, the browser
	        # would parse it as a tag and the debug line would render empty.
	        client_type = escape(str(type(backend_client))) if backend_client else 'None'
	        backend_url = escape(str(BACKEND_SPACE_URL))
	        error_message = f"""
	        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
	        <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Connection Required</h3>
	        <p style='color: #2d3436; line-height: 1.6;'>
	        The tracking and visualization features require a connection to the backend Space.
	        Please ensure:
	        </p>
	        <ul style='color: #2d3436; line-height: 1.6;'>
	        <li>The backend Space is deployed and running</li>
	        <li>The BACKEND_SPACE_URL is correctly configured</li>
	        <li>You have proper access permissions to the backend Space</li>
	        </ul>
	        <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 10px;'>
	        <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
	        <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
	        <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
	        <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {backend_url}</p>
	        <p style='color: #666; font-size: 12px; margin: 0;'>Client Type: {client_type}</p>
	        </div>
	        <p style='color: #2d3436; font-weight: bold; margin-top: 15px;'>
	        Current Status: Backend unavailable - Running in limited mode
	        </p>
	        </div>
	        """
	        return error_message, None

	    except Exception as e:
	        print(f"Error in launch_viz: {e}")
	        return None, None

	def clear_all():
	    """Return the reset values for the outputs wired to the "Clear All" button.

	    The tuple is ``(video, frame, selected_points, grid_size, vo_points,
	    fps)``: no video, no frame, an empty point list and the default slider
	    values 50 / 756 / 3. The function itself touches no files or buffers.
	    """
	    cleared_video = cleared_frame = None
	    default_grid_size, default_vo_points, default_fps = 50, 756, 3
	    return (
	        cleared_video,
	        cleared_frame,
	        [],
	        default_grid_size,
	        default_vo_points,
	        default_fps,
	    )

	def update_tracker_model(model_name):
	    """Placeholder hook for switching the tracker model; ignores *model_name* and returns None."""
	    return None

	def get_video_settings(video_name):
	    """Look up the preset ``(grid_size, vo_points, fps)`` for a video name.

	    Names without a preset get the defaults ``(50, 756, 3)``.
	    """
	    default_settings = (50, 756, 3)
	    # Presets keyed by the video's base name (file name without extension).
	    presets = dict(
	        kiss=(45, 700, 10),
	        backpack=(40, 600, 2),
	        kitchen=(60, 800, 3),
	        pillow=(35, 500, 2),
	        hockey=(45, 700, 2),
	        drifting=(35, 1000, 6),
	        ball=(45, 256, 6),
	        ken_block_0=(45, 700, 2),
	        ego_kc1=(45, 500, 4),
	        vertical_place=(45, 500, 3),
	        ego_teaser=(45, 1200, 10),
	        robot_unitree=(45, 500, 4),
	        droid_robot=(35, 400, 5),
	        robot_2=(45, 256, 5),
	        cinema_0=(45, 356, 5),
	        cinema_1=(45, 756, 3),
	    )

	    if video_name in presets:
	        return presets[video_name]
	    return default_settings

	def test_backend_connection():
	    """Check that the backend client exposes at least one API function.

	    No request is sent: the check only inspects the client's ``fns``
	    attribute. Returns False when there is no client, when ``fns`` is
	    missing or empty, or when the inspection raises.
	    """
	    if not backend_client:
	        return False

	    try:
	        print("Testing backend connection with a simple call...")
	        # Check if we have fns available
	        available_fns = getattr(backend_client, 'fns', None)
	        if not available_fns:
	            print("❌ Backend API functions not found")
	            return False

	        print("✅ Backend API functions are available")
	        print(f"🔧 Available function indices: {list(available_fns.keys())}")
	        return True
	    except Exception as e:
	        print(f"❌ Backend connection test failed: {e}")
	        return False

	def test_backend_api():
	    """Print the API functions the backend client exposes.

	    Returns True when the client's ``fns`` attribute is present and
	    non-empty. Returns False when the backend is flagged unavailable, when
	    there is no client, when no functions are found, or when the inspection
	    raises.
	    """
	    backend_ready = BACKEND_AVAILABLE and backend_client
	    if not backend_ready:
	        print("❌ Backend not available for testing")
	        return False

	    try:
	        print("🧪 Testing backend API functions...")

	        # Test if fns exist and show available indices
	        available_fns = getattr(backend_client, 'fns', None)
	        if not available_fns:
	            print("❌ No functions found in backend API")
	            return False

	        print(f"✅ Backend has {len(available_fns)} functions available")
	        for fn_index in available_fns.keys():
	            print(f"✅ Function {fn_index} is available")
	        return True

	    except Exception as e:
	        print(f"❌ Backend API test failed: {e}")
	        return False

	# Connect to the backend once, at import time; the handlers defined above
	# read the resulting BACKEND_AVAILABLE / backend_client globals.
	print("🚀 Initializing frontend application...")
	initialize_backend()

	# A client that was created but exposes no API functions is treated as
	# unavailable, so the app falls back to standalone mode.
	if BACKEND_AVAILABLE:
	    print("🧪 Testing backend connection...")
	    test_result = test_backend_connection()
	    if not test_result:
	        print("❌ Backend connection test failed!")
	        BACKEND_AVAILABLE = False
	    else:
	        print("✅ Backend connection test passed!")
	        test_backend_api()

	# Create the Gradio interface
	print("🎨 Creating Gradio interface...")

	# Top-level page container, bound to `demo` (launched at the end of the file).
	# It uses the Soft theme, a custom page title, and CSS overrides for the
	# container width, button margins and form styling.
	# NOTE(review): indentation was lost in this copy of the file; everything
	# down to the event handlers presumably lives inside this `with` block - confirm.
	with gr.Blocks(
	theme=gr.themes.Soft(),
	title="SpatialTracker V2 - Frontend",
	css="""
	.gradio-container {
	max-width: 1200px !important;
	margin: auto !important;
	}
	.gr-button {
	margin: 5px;
	}
	.gr-form {
	background: white;
	border-radius: 10px;
	padding: 20px;
	box-shadow: 0 2px 10px rgba(0,0,0,0.1);
	}
	"""
	) as demo:

	# Introductory text with the four usage steps.
	gr.Markdown("""
	# 🎯 SpatialTracker V2 - Frontend Interface

	Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.

	Instructions:
	1. Upload a video file or select from examples below
	2. Click on the object you want to track in the first frame
	3. Adjust tracking parameters if needed
	4. Click "Launch Visualization" to start tracking

	""")

	# Status indicator: computed once while the layout is built, from the value
	# BACKEND_AVAILABLE has at that moment; it is not refreshed afterwards.
	status_text = "🟢 Backend Connected" if BACKEND_AVAILABLE else "🟡 Running in Standalone Mode"
	gr.Markdown(f"Status: {status_text}")

	# Example videos section - moved to top
	# NOTE(review): the group presumably contains both the video input and the
	# examples list; the original nesting is not visible in this copy - confirm.
	with gr.Group():
	gr.Markdown("### 📂 Example Videos")
	gr.Markdown("Try these example videos to get started quickly:")

	# Define video_input here so it can be referenced in examples
	# Its change event (wired further below) calls handle_video_upload.
	video_input = gr.Video(
	label="Upload Video or Select Example",
	format="mp4"
	)

	# Each example fills video_input with a file from the examples/ folder.
	# The base names of these files are the keys of the presets in
	# get_video_settings, so every example has its own slider preset.
	gr.Examples(
	examples=[
	["examples/kiss.mp4"],
	["examples/backpack.mp4"],
	["examples/kitchen.mp4"],
	["examples/pillow.mp4"],
	["examples/hockey.mp4"],
	["examples/drifting.mp4"],
	["examples/ball.mp4"],
	["examples/ken_block_0.mp4"],
	["examples/ego_kc1.mp4"],
	["examples/vertical_place.mp4"],
	["examples/ego_teaser.mp4"],
	["examples/robot_unitree.mp4"],
	["examples/droid_robot.mp4"],
	["examples/robot_2.mp4"],
	["examples/cinema_0.mp4"],
	["examples/cinema_1.mp4"],
	],
	inputs=video_input,
	label="Click on any example to load it"
	)

	# Main area: two equally weighted columns (scale=1 each).
	# NOTE(review): presumably point selection sits in the first column and the
	# result widgets in the second; exact nesting is not visible here - confirm.
	with gr.Row():
	with gr.Column(scale=1):
	# Interactive frame display
	with gr.Group():
	gr.Markdown("### 🎯 Point Selection")
	gr.Markdown("Click on the object you want to track in the frame below:")

	# Shows the frame returned by handle_video_upload; its select event
	# (wired further below) calls select_point with the click position.
	interactive_frame = gr.Image(
	label="Click to select tracking points",
	type="numpy",
	interactive=True
	)

	# The choice strings are passed verbatim to select_point, which treats
	# "positive_point" as positive and any other value as negative.
	with gr.Row():
	point_type = gr.Radio(
	choices=["positive_point", "negative_point"],
	value="positive_point",
	label="Point Type",
	info="Positive points indicate the object to track, negative points indicate areas to avoid"
	)

	# Buttons wired further below to reset_points and clear_all.
	with gr.Row():
	reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
	clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")

	with gr.Column(scale=1):
	# Tracking results
	# Read-only player; receives the second value returned by launch_viz.
	with gr.Group():
	gr.Markdown("### 🎬 Tracking Results")
	tracking_result_video = gr.Video(
	label="Tracking Result Video",
	interactive=False
	)

	# 3D Visualization
	# Receives the first value returned by launch_viz: the backend's HTML or
	# the "backend required" notice.
	with gr.Group():
	gr.Markdown("### 🌐 3D Visualization")
	viz_html = gr.HTML(
	label="3D Trajectory Visualization",
	value="<p>Upload a video and select points to see 3D visualization here.</p>"
	)

	# Advanced settings section - changed to open=True
	# The three sliders start at 50 / 756 / 3, the same defaults that
	# handle_video_upload and clear_all return; handle_video_upload overwrites
	# them with a per-video preset when one exists.
	# NOTE(review): several presets in get_video_settings (e.g. grid size 45,
	# VO points 256) are not multiples of the step values below - confirm the
	# sliders display them as intended.
	with gr.Accordion("⚙️ Advanced Settings", open=True):
	gr.Markdown("Adjust these parameters to optimize tracking performance:")
	with gr.Row():
	grid_size = gr.Slider(
	minimum=10,
	maximum=100,
	step=10,
	value=50,
	label="Grid Size",
	info="Size of the tracking grid (larger = more detailed)"
	)
	vo_points = gr.Slider(
	minimum=100,
	maximum=2000,
	step=50,
	value=756,
	label="VO Points",
	info="Number of visual odometry points (more = better accuracy)"
	)
	fps = gr.Slider(
	minimum=1,
	maximum=30,
	step=1,
	value=3,
	label="FPS",
	info="Frames per second for processing (higher = smoother but slower)"
	)

	# Launch button
	# Its click event (wired further below) calls launch_viz.
	with gr.Row():
	launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")

	# Hidden state variables
	# original_image_state: state value returned by handle_video_upload (a JSON
	# string in the local fallback); None until a video is loaded.
	# selected_points: list of the points selected so far.
	original_image_state = gr.State(None)
	selected_points = gr.State([])

	# Event handlers
	# Each `outputs` list must match the order of the handler's return tuple.

	# New, changed or cleared video -> refresh state, frame, points and sliders.
	video_input.change(
	fn=handle_video_upload,
	inputs=[video_input],
	outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
	)

	# Click on the frame -> add a point. select_point's `evt` parameter is not
	# listed in inputs; Gradio supplies it through the gr.SelectData annotation.
	interactive_frame.select(
	fn=select_point,
	inputs=[original_image_state, selected_points, point_type],
	outputs=[interactive_frame, selected_points]
	)

	# Remove all points and show the plain first frame again.
	reset_points_btn.click(
	fn=reset_points,
	inputs=[original_image_state, selected_points],
	outputs=[interactive_frame, selected_points]
	)

	# Reset the video, the frame, the points and the sliders.
	# NOTE(review): original_image_state is not among the outputs; presumably
	# clearing video_input fires the change handler above, which resets it
	# through handle_video_upload(None) - confirm.
	clear_all_btn.click(
	fn=clear_all,
	outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
	)

	# Run tracking with the current slider values and show both results.
	launch_btn.click(
	fn=launch_viz,
	inputs=[grid_size, vo_points, fps, original_image_state],
	outputs=[viz_html, tracking_result_video]
	)

	# Start the web server only when the file is run as a script.
	if __name__ == "__main__":
	    print("🌟 Launching SpatialTracker V2 Frontend...")
	    backend_status = 'Connected' if BACKEND_AVAILABLE else 'Disconnected'
	    print(f"🔗 Backend Status: {backend_status}")

	    # Listen on all interfaces on port 7860, request a public share link,
	    # and enable debug mode and error display in the UI.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	        "debug": True,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)