import re
from html import escape
from typing import Optional

import pandas as pd

# Inline style shared by every pill-shaped badge; each badge appends its own
# background colour.
_BADGE_STYLE = (
    "display: inline-block; padding: 2px 8px; border-radius: 9999px; "
    "color: #FFFFFF; font-size: 0.85em; font-weight: 600;"
)

# Characters that may not appear in a generated ``data-*`` attribute name.
_INVALID_ATTR_CHARS = re.compile(r"[^A-Za-z0-9-]")

# Header labels, in the same order as the cells built by ``_render_row``.
_COLUMN_HEADERS = (
    "Rank",
    "Run ID",
    "Model",
    "Type",
    "Provider",
    "Hardware",
    "Success Rate",
    "Tests (P/F)",
    "Steps",
    "Duration",
    "Tokens",
    "Cost",
    "CO2",
    "GPU Util",
    "Timestamp",
    "Submitted By",
    "Actions",
)


# Mock Helper Functions
def _badge(text: object, color: str) -> str:
    """Return *text*, HTML-escaped, inside a span filled with *color*."""
    return (
        f'<span style="{_BADGE_STYLE} background-color: {color};">'
        f"{escape(str(text))}</span>"
    )


def get_rank_badge(rank) -> str:
    """Return a badge for *rank*; ranks 1-3 get gold, silver and bronze."""
    colors = {1: "#FFD700", 2: "#C0C0C0", 3: "#CD7F32"}
    return _badge(rank, colors.get(rank, "#64748B"))


def get_success_rate_bar(rate: Optional[float]) -> str:
    """Return a coloured percentage bar for *rate*, or ``"N/A"`` if missing.

    The colour is green from 75, amber from 50 and red below that.
    """
    if pd.isna(rate):
        return "N/A"
    color = "#10B981" if rate >= 75 else "#F59E0B" if rate >= 50 else "#EF4444"
    # Clamp only the drawn width; the label still shows the raw value.
    width = min(max(float(rate), 0.0), 100.0)
    return (
        '<div style="display: flex; align-items: center; gap: 8px;">'
        '<div style="flex: 1; min-width: 60px; height: 8px; '
        'border-radius: 4px; background-color: #E2E8F0;">'
        f'<div style="width: {width:.1f}%; height: 100%; '
        f'border-radius: 4px; background-color: {color};"></div>'
        "</div>"
        f'<span style="font-weight: 600; color: {color};">{rate:.1f}%</span>'
        "</div>"
    )


def get_gpu_utilization_bar(util: Optional[float]) -> str:
    """Return a percentage bar for GPU utilisation (same rendering as success rate)."""
    return get_success_rate_bar(util)


def get_provider_badge(provider) -> str:
    """Return a badge for *provider*; ``"litellm"`` has its own colour."""
    color = "#4F46E5" if provider == "litellm" else "#14B8A6"
    return _badge(provider, color)


def get_agent_type_badge(agent_type) -> str:
    """Return a badge for *agent_type*."""
    return _badge(agent_type, "#0EA5E9")


def get_hardware_badge(has_gpu) -> str:
    """Return a ``GPU`` badge when *has_gpu* is truthy, otherwise ``CPU``."""
    label = "GPU" if has_gpu else "CPU"
    color = "#10B981" if has_gpu else "#F59E0B"
    return _badge(label, color)


def format_cost(cost) -> str:
    """Format *cost* as dollars with four decimals, or ``"N/A"`` if missing."""
    if pd.isna(cost):
        return "N/A"
    return f"${cost:.4f}"


def format_duration(ms) -> str:
    """Format a millisecond duration as seconds, or ``"N/A"`` if missing."""
    if pd.isna(ms):
        return "N/A"
    return f"{ms / 1000:.2f}s"


def _or_default(value, default):
    """Return *default* when *value* is NaN/None, otherwise *value*."""
    return default if pd.isna(value) else value


def _format_count(value, *, grouped: bool = False) -> str:
    """Format a count, optionally with thousands separators; ``"N/A"`` if missing."""
    if pd.isna(value):
        return "N/A"
    # iterrows() upcasts integers to float when every column is numeric;
    # show whole numbers without the spurious ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return f"{value:,}" if grouped else f"{value}"


def _data_attributes(row: pd.Series) -> str:
    """Return every field of *row* as space-separated, escaped ``data-*`` attributes."""
    attributes = []
    for key, value in row.items():
        name = _INVALID_ATTR_CHARS.sub("-", str(key).replace("_", "-"))
        attributes.append(f'data-{name}="{escape(str(value), quote=True)}"')
    return " ".join(attributes)


def _render_row(rank: int, row: pd.Series) -> str:
    """Return the ``<tr>`` element for one leaderboard entry."""
    gpu_utilization_avg = row.get("gpu_utilization_avg", None)
    co2_emissions_g = row.get("co2_emissions_g", 0.0)
    timestamp = row.get("timestamp", "")
    avg_steps = row.get("avg_steps", 0.0)

    has_gpu = bool(pd.notna(gpu_utilization_avg) and gpu_utilization_avg > 0)
    gpu_display = get_gpu_utilization_bar(gpu_utilization_avg) if has_gpu else "N/A"
    co2_display = (
        f"{co2_emissions_g:.2f}g"
        if pd.notna(co2_emissions_g) and co2_emissions_g > 0
        else "N/A"
    )
    timestamp_display = str(timestamp)[:16] if pd.notna(timestamp) else "N/A"

    # The run id may be NaN/None or numeric, so normalise before slicing.
    run_id = str(_or_default(row.get("run_id", "N/A"), "N/A"))
    run_id_short = run_id[:8] + "..." if len(run_id) > 8 else run_id

    tests_display = " / ".join(
        _format_count(row.get(column, 0))
        for column in ("total_tests", "successful_tests", "failed_tests")
    )

    cells = [
        get_rank_badge(rank),
        escape(run_id_short),
        escape(str(_or_default(row.get("model", "Unknown"), "Unknown"))),
        get_agent_type_badge(_or_default(row.get("agent_type", "unknown"), "unknown")),
        get_provider_badge(_or_default(row.get("provider", "unknown"), "unknown")),
        get_hardware_badge(has_gpu),
        get_success_rate_bar(row.get("success_rate", 0.0)),
        tests_display,
        "N/A" if pd.isna(avg_steps) else f"{avg_steps:.1f}",
        format_duration(row.get("avg_duration_ms", 0.0)),
        _format_count(row.get("total_tokens", 0), grouped=True),
        format_cost(row.get("total_cost_usd", 0.0)),
        co2_display,
        gpu_display,
        escape(timestamp_display),
        escape(str(_or_default(row.get("submitted_by", "Unknown"), "Unknown"))),
        "",  # "Actions" column: no per-row content is defined here.
    ]
    body = "".join(f"<td>{cell}</td>" for cell in cells)
    return f"<tr {_data_attributes(row)}>{body}</tr>"


def generate_leaderboard_html(
    df: pd.DataFrame, sort_by: str = "success_rate", ascending: bool = False
) -> str:
    """
    Generates a styled HTML table for the leaderboard.

    Rows are ranked from 1 in sorted order. All text taken from the
    DataFrame is HTML-escaped, and every field of a row is also exposed on
    its ``<tr>`` as a ``data-*`` attribute.

    Args:
        df: The leaderboard DataFrame.
        sort_by: The column to sort the DataFrame by.
        ascending: The sort order.

    Returns:
        A string containing the complete HTML for the table. An empty
        DataFrame yields a table with only the header row.

    Raises:
        KeyError: If ``df`` is non-empty and has no ``sort_by`` column.
    """
    if df.empty:
        # Nothing to sort; this also avoids a KeyError on a column-less frame.
        df_sorted = df
    else:
        df_sorted = df.sort_values(by=sort_by, ascending=ascending).reset_index(
            drop=True
        )

    header_cells = "".join(f"<th>{label}</th>" for label in _COLUMN_HEADERS)
    parts = ["<table>", f"<thead><tr>{header_cells}</tr></thead>", "<tbody>"]
    parts.extend(
        _render_row(rank, row)
        for rank, (_, row) in enumerate(df_sorted.iterrows(), start=1)
    )
    parts.extend(["</tbody>", "</table>"])
    return "\n".join(parts)