Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,6 @@ import json
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
import sqlite3
|
| 5 |
-
from dataclasses import dataclass
|
| 6 |
from datetime import datetime
|
| 7 |
from typing import Dict, List, Tuple, Optional
|
| 8 |
|
|
@@ -33,12 +32,13 @@ CSS_PATH = os.path.join("assets", "zen.css")
|
|
| 33 |
DEFAULT_RATING = 1200.0
|
| 34 |
K_FACTOR = 16.0
|
| 35 |
|
|
|
|
| 36 |
# ---------------------------
|
| 37 |
-
#
|
| 38 |
# ---------------------------
|
| 39 |
def guess_provider(model: str) -> str:
|
| 40 |
-
m = model.lower()
|
| 41 |
-
if "gpt" in m or "chatgpt" in m or "o3"
|
| 42 |
return "OpenAI"
|
| 43 |
if "gemini" in m or "veo" in m:
|
| 44 |
return "Google"
|
|
@@ -52,8 +52,6 @@ def guess_provider(model: str) -> str:
|
|
| 52 |
return "Black Forest Labs"
|
| 53 |
if "kling" in m:
|
| 54 |
return "Kuaishou"
|
| 55 |
-
if "sora" in m:
|
| 56 |
-
return "OpenAI"
|
| 57 |
if "wan" in m:
|
| 58 |
return "WAN"
|
| 59 |
if "hunyuan" in m:
|
|
@@ -62,10 +60,11 @@ def guess_provider(model: str) -> str:
|
|
| 62 |
return "ByteDance"
|
| 63 |
return "Other"
|
| 64 |
|
|
|
|
| 65 |
# ---------------------------
|
| 66 |
# SQLite persistence
|
| 67 |
# ---------------------------
|
| 68 |
-
def db():
|
| 69 |
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
|
| 70 |
conn.execute(
|
| 71 |
"""
|
|
@@ -94,11 +93,19 @@ def db():
|
|
| 94 |
conn.commit()
|
| 95 |
return conn
|
| 96 |
|
|
|
|
| 97 |
def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses a timezone-aware clock instead of the deprecated
    ``datetime.utcnow()``; the returned string format is unchanged
    (second precision, literal ``Z`` suffix).
    """
    # Local import: the file's import block only brings in `datetime`.
    from datetime import timezone

    # Drop tzinfo before formatting so isoformat() does not append "+00:00".
    current = datetime.now(timezone.utc).replace(tzinfo=None)
    return current.isoformat(timespec="seconds") + "Z"
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
with FileLock(DB_LOCK):
|
| 103 |
conn = db()
|
| 104 |
conn.execute(
|
|
@@ -108,6 +115,7 @@ def ensure_model(arena: str, model: str, provider: Optional[str] = None, default
|
|
| 108 |
conn.commit()
|
| 109 |
conn.close()
|
| 110 |
|
|
|
|
| 111 |
def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
|
| 112 |
with FileLock(DB_LOCK):
|
| 113 |
conn = db()
|
|
@@ -115,10 +123,12 @@ def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
|
|
| 115 |
cur.execute("SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?", (arena, model))
|
| 116 |
row = cur.fetchone()
|
| 117 |
conn.close()
|
|
|
|
| 118 |
if row is None:
|
| 119 |
return (DEFAULT_RATING, 0, guess_provider(model))
|
| 120 |
return (float(row[0]), int(row[1]), str(row[2]))
|
| 121 |
|
|
|
|
| 122 |
def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
|
| 123 |
ea = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
|
| 124 |
sa = 1.0 if a_wins else 0.0
|
|
@@ -126,12 +136,14 @@ def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tup
|
|
| 126 |
new_b = r_b + k * ((1.0 - sa) - (1.0 - ea))
|
| 127 |
return new_a, new_b
|
| 128 |
|
| 129 |
-
|
|
|
|
| 130 |
ensure_model(arena, winner)
|
| 131 |
ensure_model(arena, loser)
|
| 132 |
|
| 133 |
r_w, v_w, p_w = get_rating(arena, winner)
|
| 134 |
r_l, v_l, p_l = get_rating(arena, loser)
|
|
|
|
| 135 |
new_w, new_l = elo_update(r_w, r_l, True)
|
| 136 |
|
| 137 |
with FileLock(DB_LOCK):
|
|
@@ -151,9 +163,10 @@ def vote(arena: str, winner: str, loser: str):
|
|
| 151 |
conn.commit()
|
| 152 |
conn.close()
|
| 153 |
|
| 154 |
-
|
|
|
|
| 155 |
if not os.path.exists(SEED_PATH):
|
| 156 |
-
return {"seeded_rows": 0, "note": "
|
| 157 |
|
| 158 |
with open(SEED_PATH, "r", encoding="utf-8") as f:
|
| 159 |
seed = json.load(f)
|
|
@@ -165,13 +178,16 @@ def seed_from_json(force: bool = False) -> Dict[str, int]:
|
|
| 165 |
|
| 166 |
if force:
|
| 167 |
cur.execute("DELETE FROM ratings")
|
|
|
|
| 168 |
conn.commit()
|
| 169 |
|
| 170 |
for arena, rows in seed.items():
|
| 171 |
if arena not in ARENAS:
|
| 172 |
continue
|
| 173 |
for item in rows:
|
| 174 |
-
model = item
|
|
|
|
|
|
|
| 175 |
score = float(item.get("score", DEFAULT_RATING))
|
| 176 |
votes_n = int(item.get("votes", 0))
|
| 177 |
provider = guess_provider(model)
|
|
@@ -188,9 +204,10 @@ def seed_from_json(force: bool = False) -> Dict[str, int]:
|
|
| 188 |
conn.commit()
|
| 189 |
conn.close()
|
| 190 |
|
| 191 |
-
return {"seeded_rows": seeded, "note": "
|
|
|
|
| 192 |
|
| 193 |
-
def ensure_seed_once():
|
| 194 |
with FileLock(DB_LOCK):
|
| 195 |
conn = db()
|
| 196 |
cur = conn.cursor()
|
|
@@ -200,26 +217,54 @@ def ensure_seed_once():
|
|
| 200 |
if n == 0:
|
| 201 |
seed_from_json(force=False)
|
| 202 |
|
|
|
|
| 203 |
# ---------------------------
|
| 204 |
-
#
|
| 205 |
# ---------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
|
| 207 |
ensure_seed_once()
|
| 208 |
-
|
|
|
|
|
|
|
| 209 |
provider = provider or "All"
|
|
|
|
| 210 |
|
| 211 |
where = ["arena = ?"]
|
| 212 |
params: List[object] = [arena]
|
| 213 |
|
| 214 |
if search:
|
| 215 |
where.append("LOWER(model) LIKE ?")
|
| 216 |
-
params.append(f"%{search
|
|
|
|
| 217 |
if provider != "All":
|
| 218 |
where.append("provider = ?")
|
| 219 |
params.append(provider)
|
|
|
|
| 220 |
if min_votes > 0:
|
| 221 |
where.append("votes >= ?")
|
| 222 |
-
params.append(
|
| 223 |
|
| 224 |
where_sql = " AND ".join(where)
|
| 225 |
q = f"""
|
|
@@ -244,39 +289,18 @@ def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_vote
|
|
| 244 |
df.insert(0, "Rank", np.arange(1, len(df) + 1))
|
| 245 |
return df[["Rank", "Model", "Provider", "Score", "Votes", "Updated"]]
|
| 246 |
|
| 247 |
-
def providers_list() -> List[str]:
    """Return the provider filter choices: ``"All"`` plus every distinct provider.

    Calls ``ensure_seed_once()`` first, then reads the distinct ``provider``
    values of the ``ratings`` table in ascending order while holding the
    ``DB_LOCK`` file lock.
    """
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            cur.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC")
            rows = [r[0] for r in cur.fetchall()]
        finally:
            # Close even when the query raises so the connection is not leaked.
            conn.close()
    return ["All"] + rows
|
| 256 |
-
|
| 257 |
-
def all_models() -> List[str]:
    """Return every distinct model name stored in the ``ratings`` table.

    Calls ``ensure_seed_once()`` first, then reads the distinct ``model``
    values in ascending order while holding the ``DB_LOCK`` file lock.
    """
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            cur.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC")
            rows = [r[0] for r in cur.fetchall()]
        finally:
            # Close even when the query raises so the connection is not leaked.
            conn.close()
    return rows
|
| 266 |
|
| 267 |
def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
|
| 268 |
-
"""
|
| 269 |
-
Wide matrix: each model gets Rank/Score/Votes per arena (compact).
|
| 270 |
-
"""
|
| 271 |
ensure_seed_once()
|
|
|
|
| 272 |
search = (search or "").strip().lower()
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
with FileLock(DB_LOCK):
|
| 275 |
conn = db()
|
| 276 |
-
base = pd.read_sql_query(
|
| 277 |
-
"SELECT arena, model, provider, rating, votes FROM ratings",
|
| 278 |
-
conn
|
| 279 |
-
)
|
| 280 |
conn.close()
|
| 281 |
|
| 282 |
if base.empty:
|
|
@@ -285,15 +309,16 @@ def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: in
|
|
| 285 |
if provider != "All":
|
| 286 |
base = base[base["provider"] == provider]
|
| 287 |
if min_votes > 0:
|
| 288 |
-
base = base[base["votes"] >=
|
| 289 |
if search:
|
| 290 |
base = base[base["model"].str.lower().str.contains(search, na=False)]
|
| 291 |
|
| 292 |
-
|
|
|
|
|
|
|
| 293 |
base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
|
| 294 |
base["score"] = base["rating"].round().astype(int)
|
| 295 |
|
| 296 |
-
# choose top N models by "best average rank across arenas they appear in"
|
| 297 |
pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
|
| 298 |
avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
|
| 299 |
chosen = avg_rank.head(limit_models).index
|
|
@@ -301,31 +326,31 @@ def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: in
|
|
| 301 |
base = base.set_index(["model", "provider"])
|
| 302 |
base = base.loc[base.index.isin(chosen)].reset_index()
|
| 303 |
|
| 304 |
-
|
| 305 |
-
out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]})
|
| 306 |
-
out = out.reset_index(drop=True)
|
| 307 |
|
| 308 |
for a in ARENAS:
|
| 309 |
-
sub = base[base["arena"] == a][["model", "provider", "rank", "score", "votes"]]
|
| 310 |
sub = sub.rename(columns={
|
| 311 |
"rank": f"{a} Rank",
|
| 312 |
"score": f"{a} Score",
|
| 313 |
"votes": f"{a} Votes",
|
| 314 |
})
|
| 315 |
out = out.merge(sub, how="left", left_on=["Model", "Provider"], right_on=["model", "provider"])
|
| 316 |
-
|
|
|
|
|
|
|
| 317 |
|
| 318 |
-
# sort by best avg rank
|
| 319 |
out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
|
| 320 |
out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])
|
| 321 |
|
| 322 |
-
# nicer types
|
| 323 |
for a in ARENAS:
|
| 324 |
for col in [f"{a} Rank", f"{a} Score", f"{a} Votes"]:
|
| 325 |
if col in out.columns:
|
| 326 |
out[col] = out[col].astype("Int64")
|
|
|
|
| 327 |
return out
|
| 328 |
|
|
|
|
| 329 |
def kpis() -> Dict[str, str]:
|
| 330 |
ensure_seed_once()
|
| 331 |
with FileLock(DB_LOCK):
|
|
@@ -336,7 +361,7 @@ def kpis() -> Dict[str, str]:
|
|
| 336 |
cur.execute("SELECT COUNT(*) FROM ratings")
|
| 337 |
rows = cur.fetchone()[0]
|
| 338 |
cur.execute("SELECT COUNT(*) FROM votes_log")
|
| 339 |
-
|
| 340 |
cur.execute("SELECT MAX(created_at) FROM votes_log")
|
| 341 |
last_vote = cur.fetchone()[0]
|
| 342 |
conn.close()
|
|
@@ -344,75 +369,81 @@ def kpis() -> Dict[str, str]:
|
|
| 344 |
return {
|
| 345 |
"models": str(models),
|
| 346 |
"entries": str(rows),
|
| 347 |
-
"votes": str(
|
| 348 |
"last_vote": last_vote or "—",
|
| 349 |
}
|
| 350 |
|
|
|
|
| 351 |
# ---------------------------
|
| 352 |
-
# Voting
|
| 353 |
# ---------------------------
|
| 354 |
def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
|
| 355 |
df = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)
|
| 356 |
models = df["Model"].tolist()
|
| 357 |
if len(models) < 2:
|
| 358 |
-
models =
|
| 359 |
if len(models) < 2:
|
| 360 |
return ("model-a", "model-b")
|
| 361 |
-
|
| 362 |
-
|
| 363 |
|
| 364 |
def model_card_md(model: str, arena: Optional[str] = None) -> str:
|
| 365 |
provider = guess_provider(model)
|
| 366 |
-
|
| 367 |
if arena:
|
| 368 |
-
r, v,
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
f"**Score:** {int(round(r))}",
|
| 373 |
-
f"**Votes:** {v}",
|
| 374 |
-
]
|
| 375 |
-
return "\n".join(lines)
|
| 376 |
|
| 377 |
def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
|
| 378 |
ensure_seed_once()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
rows = []
|
| 380 |
for a in ARENAS:
|
| 381 |
r, v, p = get_rating(a, model)
|
| 382 |
rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
|
| 383 |
df = pd.DataFrame(rows).sort_values("Score", ascending=False)
|
|
|
|
| 384 |
best = df.iloc[0]
|
| 385 |
worst = df.iloc[-1]
|
| 386 |
summary = (
|
| 387 |
-
|
| 388 |
-
|
| 389 |
f"<div class='zen-sub'><b>{model}</b> · Provider: <b>{guess_provider(model)}</b></div>"
|
| 390 |
-
|
| 391 |
-
|
| 392 |
f"Best arena: <b>{best['Arena']}</b> (Score {best['Score']}, Votes {best['Votes']}). "
|
| 393 |
f"Worst arena: <b>{worst['Arena']}</b> (Score {worst['Score']}, Votes {worst['Votes']})."
|
| 394 |
-
|
| 395 |
-
|
| 396 |
)
|
| 397 |
return df, summary
|
| 398 |
|
|
|
|
| 399 |
# ---------------------------
|
| 400 |
-
#
|
| 401 |
# ---------------------------
|
| 402 |
ensure_seed_once()
|
|
|
|
| 403 |
css = ""
|
| 404 |
if os.path.exists(CSS_PATH):
|
| 405 |
with open(CSS_PATH, "r", encoding="utf-8") as f:
|
| 406 |
css = f.read()
|
| 407 |
|
| 408 |
-
with gr.Blocks(
|
|
|
|
|
|
|
| 409 |
|
| 410 |
k = kpis()
|
| 411 |
header = f"""
|
| 412 |
<div class="zen-card">
|
| 413 |
<div class="zen-title">ZEN Model Arena Leaderboard</div>
|
| 414 |
<p class="zen-sub">
|
| 415 |
-
|
| 416 |
</p>
|
| 417 |
<div class="zen-kpi">
|
| 418 |
<div><div class="k">Models</div><div class="v">{k['models']}</div><div class="s">unique IDs tracked</div></div>
|
|
@@ -421,9 +452,9 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 421 |
<div><div class="k">Last Vote</div><div class="v" style="font-size:12px; font-weight:700;">{k['last_vote']}</div><div class="s">UTC</div></div>
|
| 422 |
</div>
|
| 423 |
<div class="zen-hr"></div>
|
| 424 |
-
<span class="zen-badge">Gradio
|
| 425 |
<span class="zen-badge">SQLite + FileLock</span>
|
| 426 |
-
<span class="zen-badge">
|
| 427 |
<span class="zen-badge">Search + Filters</span>
|
| 428 |
</div>
|
| 429 |
"""
|
|
@@ -433,19 +464,17 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 433 |
|
| 434 |
with gr.Tabs():
|
| 435 |
|
| 436 |
-
# ---------------------------
|
| 437 |
# Overview
|
| 438 |
-
# ---------------------------
|
| 439 |
with gr.Tab("Leaderboard Overview"):
|
| 440 |
-
gr.Markdown("### Top 10
|
| 441 |
|
| 442 |
with gr.Row():
|
| 443 |
arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
|
| 444 |
-
provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider
|
| 445 |
min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 446 |
-
search_ov = gr.Textbox(value="", label="Search models", placeholder="
|
| 447 |
|
| 448 |
-
df_ov = gr.Dataframe(interactive=False, wrap=True,
|
| 449 |
refresh_ov = gr.Button("Refresh overview", variant="primary")
|
| 450 |
|
| 451 |
def refresh_overview(arena, provider, min_votes, search):
|
|
@@ -454,19 +483,17 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 454 |
refresh_ov.click(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
|
| 455 |
demo.load(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
|
| 456 |
|
| 457 |
-
#
|
| 458 |
-
# Arena Overview Matrix (the cool part)
|
| 459 |
-
# ---------------------------
|
| 460 |
with gr.Tab("Arena Overview Matrix"):
|
| 461 |
-
gr.Markdown("### Cross-arena placements
|
| 462 |
|
| 463 |
with gr.Row():
|
| 464 |
provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
|
| 465 |
min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 466 |
-
search_mx = gr.Textbox(value="", label="Search"
|
| 467 |
-
limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models
|
| 468 |
|
| 469 |
-
mx = gr.Dataframe(interactive=False, wrap=True,
|
| 470 |
refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")
|
| 471 |
|
| 472 |
def build_matrix(provider, min_votes, search, limit_models):
|
|
@@ -475,39 +502,34 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 475 |
refresh_mx.click(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
|
| 476 |
demo.load(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
|
| 477 |
|
| 478 |
-
#
|
| 479 |
-
# Per-arena deep view
|
| 480 |
-
# ---------------------------
|
| 481 |
for arena in ARENAS:
|
| 482 |
with gr.Tab(arena):
|
| 483 |
-
gr.Markdown(f"### {arena} Leaderboard (live DB)
|
| 484 |
|
| 485 |
with gr.Row():
|
| 486 |
provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
|
| 487 |
min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 488 |
search = gr.Textbox(value="", label="Search")
|
| 489 |
|
| 490 |
-
df = gr.Dataframe(interactive=False, wrap=True,
|
| 491 |
btn = gr.Button("Refresh", variant="primary")
|
| 492 |
|
| 493 |
-
btn.click(
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
|
|
|
|
|
|
| 497 |
|
| 498 |
-
#
|
| 499 |
-
# Voting (pairwise)
|
| 500 |
-
# ---------------------------
|
| 501 |
with gr.Tab("Start Voting"):
|
| 502 |
-
gr.Markdown(
|
| 503 |
-
"### Pairwise Voting (Elo)\n"
|
| 504 |
-
"Pick which model wins for a chosen arena. Scores update instantly and feed the Matrix."
|
| 505 |
-
)
|
| 506 |
|
| 507 |
with gr.Row():
|
| 508 |
arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
|
| 509 |
provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
|
| 510 |
-
|
| 511 |
|
| 512 |
left_state = gr.State("")
|
| 513 |
right_state = gr.State("")
|
|
@@ -526,56 +548,46 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 526 |
a, b = pick_pair(arena, provider=provider)
|
| 527 |
return model_card_md(a, arena), model_card_md(b, arena), a, b, "<div class='zen-note'>New matchup ready.</div>"
|
| 528 |
|
| 529 |
-
new_match.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 530 |
-
demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 531 |
-
|
| 532 |
def left_wins(arena, left, right, provider):
|
| 533 |
-
if
|
| 534 |
-
|
| 535 |
-
vote(arena, winner=left, loser=right)
|
| 536 |
return new_matchup(arena, provider)
|
| 537 |
|
| 538 |
def right_wins(arena, left, right, provider):
|
| 539 |
-
if
|
| 540 |
-
|
| 541 |
-
vote(arena, winner=right, loser=left)
|
| 542 |
return new_matchup(arena, provider)
|
| 543 |
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
right_btn.click(right_wins, inputs=[arena_vote, left_state, right_state, provider_vote],
|
| 547 |
-
|
| 548 |
|
| 549 |
-
#
|
| 550 |
-
# Model Profiles
|
| 551 |
-
# ---------------------------
|
| 552 |
with gr.Tab("Model Profiles"):
|
| 553 |
-
gr.Markdown("### Inspect a model across arenas
|
| 554 |
|
| 555 |
-
|
|
|
|
| 556 |
prof_summary = gr.HTML()
|
| 557 |
-
prof_df = gr.Dataframe(interactive=False, wrap=True,
|
| 558 |
-
|
| 559 |
load_btn = gr.Button("Load Profile", variant="primary")
|
|
|
|
| 560 |
load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
|
| 561 |
demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
|
| 562 |
|
| 563 |
-
#
|
| 564 |
-
# Admin / Ops
|
| 565 |
-
# ---------------------------
|
| 566 |
with gr.Tab("Admin"):
|
| 567 |
-
gr.Markdown("### Admin Tools
|
| 568 |
|
| 569 |
with gr.Row():
|
| 570 |
-
reseed_force = gr.Checkbox(value=False, label="Force reseed (
|
| 571 |
reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
|
| 572 |
|
| 573 |
reseed_out = gr.JSON()
|
|
|
|
| 574 |
|
| 575 |
-
|
| 576 |
-
inputs=[reseed_force], outputs=[reseed_out])
|
| 577 |
-
|
| 578 |
-
gr.Markdown("#### Add a model to one or more arenas")
|
| 579 |
new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
|
| 580 |
new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
|
| 581 |
arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
|
|
@@ -595,8 +607,9 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 595 |
|
| 596 |
add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])
|
| 597 |
|
| 598 |
-
gr.Markdown("####
|
| 599 |
sanity = gr.JSON()
|
|
|
|
| 600 |
def sanity_check():
|
| 601 |
return {
|
| 602 |
"time_utc": now_iso(),
|
|
@@ -607,6 +620,7 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
|
|
| 607 |
"providers_detected": providers_list(),
|
| 608 |
"models_count": len(all_models()),
|
| 609 |
}
|
|
|
|
| 610 |
demo.load(sanity_check, outputs=[sanity])
|
| 611 |
|
| 612 |
gr.close_all()
|
|
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
import sqlite3
|
|
|
|
| 5 |
from datetime import datetime
|
| 6 |
from typing import Dict, List, Tuple, Optional
|
| 7 |
|
|
|
|
| 32 |
DEFAULT_RATING = 1200.0
|
| 33 |
K_FACTOR = 16.0
|
| 34 |
|
| 35 |
+
|
| 36 |
# ---------------------------
|
| 37 |
+
# Provider tagging (heuristic)
|
| 38 |
# ---------------------------
|
| 39 |
def guess_provider(model: str) -> str:
|
| 40 |
+
m = (model or "").lower()
|
| 41 |
+
if "gpt" in m or "chatgpt" in m or m.startswith("o3"):
|
| 42 |
return "OpenAI"
|
| 43 |
if "gemini" in m or "veo" in m:
|
| 44 |
return "Google"
|
|
|
|
| 52 |
return "Black Forest Labs"
|
| 53 |
if "kling" in m:
|
| 54 |
return "Kuaishou"
|
|
|
|
|
|
|
| 55 |
if "wan" in m:
|
| 56 |
return "WAN"
|
| 57 |
if "hunyuan" in m:
|
|
|
|
| 60 |
return "ByteDance"
|
| 61 |
return "Other"
|
| 62 |
|
| 63 |
+
|
| 64 |
# ---------------------------
|
| 65 |
# SQLite persistence
|
| 66 |
# ---------------------------
|
| 67 |
+
def db() -> sqlite3.Connection:
|
| 68 |
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
|
| 69 |
conn.execute(
|
| 70 |
"""
|
|
|
|
| 93 |
conn.commit()
|
| 94 |
return conn
|
| 95 |
|
| 96 |
+
|
| 97 |
def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    Uses a timezone-aware clock instead of the deprecated
    ``datetime.utcnow()``; the returned string format is unchanged
    (second precision, literal ``Z`` suffix).
    """
    # Local import: the file's import block only brings in `datetime`.
    from datetime import timezone

    # Drop tzinfo before formatting so isoformat() does not append "+00:00".
    current = datetime.now(timezone.utc).replace(tzinfo=None)
    return current.isoformat(timespec="seconds") + "Z"
|
| 99 |
|
| 100 |
+
|
| 101 |
+
def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING) -> None:
|
| 102 |
+
if arena not in ARENAS:
|
| 103 |
+
return
|
| 104 |
+
model = (model or "").strip()
|
| 105 |
+
if not model:
|
| 106 |
+
return
|
| 107 |
+
provider = (provider or "").strip() or guess_provider(model)
|
| 108 |
+
|
| 109 |
with FileLock(DB_LOCK):
|
| 110 |
conn = db()
|
| 111 |
conn.execute(
|
|
|
|
| 115 |
conn.commit()
|
| 116 |
conn.close()
|
| 117 |
|
| 118 |
+
|
| 119 |
def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
|
| 120 |
with FileLock(DB_LOCK):
|
| 121 |
conn = db()
|
|
|
|
| 123 |
cur.execute("SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?", (arena, model))
|
| 124 |
row = cur.fetchone()
|
| 125 |
conn.close()
|
| 126 |
+
|
| 127 |
if row is None:
|
| 128 |
return (DEFAULT_RATING, 0, guess_provider(model))
|
| 129 |
return (float(row[0]), int(row[1]), str(row[2]))
|
| 130 |
|
| 131 |
+
|
| 132 |
def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
|
| 133 |
ea = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
|
| 134 |
sa = 1.0 if a_wins else 0.0
|
|
|
|
| 136 |
new_b = r_b + k * ((1.0 - sa) - (1.0 - ea))
|
| 137 |
return new_a, new_b
|
| 138 |
|
| 139 |
+
|
| 140 |
+
def vote(arena: str, winner: str, loser: str) -> None:
|
| 141 |
ensure_model(arena, winner)
|
| 142 |
ensure_model(arena, loser)
|
| 143 |
|
| 144 |
r_w, v_w, p_w = get_rating(arena, winner)
|
| 145 |
r_l, v_l, p_l = get_rating(arena, loser)
|
| 146 |
+
|
| 147 |
new_w, new_l = elo_update(r_w, r_l, True)
|
| 148 |
|
| 149 |
with FileLock(DB_LOCK):
|
|
|
|
| 163 |
conn.commit()
|
| 164 |
conn.close()
|
| 165 |
|
| 166 |
+
|
| 167 |
+
def seed_from_json(force: bool = False) -> Dict[str, object]:
|
| 168 |
if not os.path.exists(SEED_PATH):
|
| 169 |
+
return {"ok": False, "seeded_rows": 0, "note": "Missing data/seed_snapshot.json"}
|
| 170 |
|
| 171 |
with open(SEED_PATH, "r", encoding="utf-8") as f:
|
| 172 |
seed = json.load(f)
|
|
|
|
| 178 |
|
| 179 |
if force:
|
| 180 |
cur.execute("DELETE FROM ratings")
|
| 181 |
+
cur.execute("DELETE FROM votes_log")
|
| 182 |
conn.commit()
|
| 183 |
|
| 184 |
for arena, rows in seed.items():
|
| 185 |
if arena not in ARENAS:
|
| 186 |
continue
|
| 187 |
for item in rows:
|
| 188 |
+
model = str(item.get("model", "")).strip()
|
| 189 |
+
if not model:
|
| 190 |
+
continue
|
| 191 |
score = float(item.get("score", DEFAULT_RATING))
|
| 192 |
votes_n = int(item.get("votes", 0))
|
| 193 |
provider = guess_provider(model)
|
|
|
|
| 204 |
conn.commit()
|
| 205 |
conn.close()
|
| 206 |
|
| 207 |
+
return {"ok": True, "seeded_rows": seeded, "note": "Seeded successfully"}
|
| 208 |
+
|
| 209 |
|
| 210 |
+
def ensure_seed_once() -> None:
|
| 211 |
with FileLock(DB_LOCK):
|
| 212 |
conn = db()
|
| 213 |
cur = conn.cursor()
|
|
|
|
| 217 |
if n == 0:
|
| 218 |
seed_from_json(force=False)
|
| 219 |
|
| 220 |
+
|
| 221 |
# ---------------------------
|
| 222 |
+
# Query helpers
|
| 223 |
# ---------------------------
|
| 224 |
+
def providers_list() -> List[str]:
    """Return the provider filter choices: ``"All"`` plus every distinct provider.

    Calls ``ensure_seed_once()`` first, then reads the distinct ``provider``
    values of the ``ratings`` table in ascending order while holding the
    ``DB_LOCK`` file lock.
    """
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            cur.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC")
            rows = [r[0] for r in cur.fetchall()]
        finally:
            # Close even when the query raises so the connection is not leaked.
            conn.close()
    return ["All"] + rows
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def all_models() -> List[str]:
    """Return every distinct model name stored in the ``ratings`` table.

    Calls ``ensure_seed_once()`` first, then reads the distinct ``model``
    values in ascending order while holding the ``DB_LOCK`` file lock.
    """
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            cur.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC")
            rows = [r[0] for r in cur.fetchall()]
        finally:
            # Close even when the query raises so the connection is not leaked.
            conn.close()
    return rows
|
| 244 |
+
|
| 245 |
+
|
| 246 |
def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
|
| 247 |
ensure_seed_once()
|
| 248 |
+
|
| 249 |
+
arena = arena if arena in ARENAS else "Text"
|
| 250 |
+
search = (search or "").strip().lower()
|
| 251 |
provider = provider or "All"
|
| 252 |
+
min_votes = int(min_votes or 0)
|
| 253 |
|
| 254 |
where = ["arena = ?"]
|
| 255 |
params: List[object] = [arena]
|
| 256 |
|
| 257 |
if search:
|
| 258 |
where.append("LOWER(model) LIKE ?")
|
| 259 |
+
params.append(f"%{search}%")
|
| 260 |
+
|
| 261 |
if provider != "All":
|
| 262 |
where.append("provider = ?")
|
| 263 |
params.append(provider)
|
| 264 |
+
|
| 265 |
if min_votes > 0:
|
| 266 |
where.append("votes >= ?")
|
| 267 |
+
params.append(min_votes)
|
| 268 |
|
| 269 |
where_sql = " AND ".join(where)
|
| 270 |
q = f"""
|
|
|
|
| 289 |
df.insert(0, "Rank", np.arange(1, len(df) + 1))
|
| 290 |
return df[["Rank", "Model", "Provider", "Score", "Votes", "Updated"]]
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
| 294 |
ensure_seed_once()
|
| 295 |
+
|
| 296 |
search = (search or "").strip().lower()
|
| 297 |
+
provider = provider or "All"
|
| 298 |
+
min_votes = int(min_votes or 0)
|
| 299 |
+
limit_models = int(limit_models or 200)
|
| 300 |
|
| 301 |
with FileLock(DB_LOCK):
|
| 302 |
conn = db()
|
| 303 |
+
base = pd.read_sql_query("SELECT arena, model, provider, rating, votes FROM ratings", conn)
|
|
|
|
|
|
|
|
|
|
| 304 |
conn.close()
|
| 305 |
|
| 306 |
if base.empty:
|
|
|
|
| 309 |
if provider != "All":
|
| 310 |
base = base[base["provider"] == provider]
|
| 311 |
if min_votes > 0:
|
| 312 |
+
base = base[base["votes"] >= min_votes]
|
| 313 |
if search:
|
| 314 |
base = base[base["model"].str.lower().str.contains(search, na=False)]
|
| 315 |
|
| 316 |
+
if base.empty:
|
| 317 |
+
return pd.DataFrame()
|
| 318 |
+
|
| 319 |
base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
|
| 320 |
base["score"] = base["rating"].round().astype(int)
|
| 321 |
|
|
|
|
| 322 |
pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
|
| 323 |
avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
|
| 324 |
chosen = avg_rank.head(limit_models).index
|
|
|
|
| 326 |
base = base.set_index(["model", "provider"])
|
| 327 |
base = base.loc[base.index.isin(chosen)].reset_index()
|
| 328 |
|
| 329 |
+
out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]}).reset_index(drop=True)
|
|
|
|
|
|
|
| 330 |
|
| 331 |
for a in ARENAS:
|
| 332 |
+
sub = base[base["arena"] == a][["model", "provider", "rank", "score", "votes"]].copy()
|
| 333 |
sub = sub.rename(columns={
|
| 334 |
"rank": f"{a} Rank",
|
| 335 |
"score": f"{a} Score",
|
| 336 |
"votes": f"{a} Votes",
|
| 337 |
})
|
| 338 |
out = out.merge(sub, how="left", left_on=["Model", "Provider"], right_on=["model", "provider"])
|
| 339 |
+
for c in ["model", "provider"]:
|
| 340 |
+
if c in out.columns:
|
| 341 |
+
out.drop(columns=[c], inplace=True)
|
| 342 |
|
|
|
|
| 343 |
out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
|
| 344 |
out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])
|
| 345 |
|
|
|
|
| 346 |
for a in ARENAS:
|
| 347 |
for col in [f"{a} Rank", f"{a} Score", f"{a} Votes"]:
|
| 348 |
if col in out.columns:
|
| 349 |
out[col] = out[col].astype("Int64")
|
| 350 |
+
|
| 351 |
return out
|
| 352 |
|
| 353 |
+
|
| 354 |
def kpis() -> Dict[str, str]:
|
| 355 |
ensure_seed_once()
|
| 356 |
with FileLock(DB_LOCK):
|
|
|
|
| 361 |
cur.execute("SELECT COUNT(*) FROM ratings")
|
| 362 |
rows = cur.fetchone()[0]
|
| 363 |
cur.execute("SELECT COUNT(*) FROM votes_log")
|
| 364 |
+
votes_n = cur.fetchone()[0]
|
| 365 |
cur.execute("SELECT MAX(created_at) FROM votes_log")
|
| 366 |
last_vote = cur.fetchone()[0]
|
| 367 |
conn.close()
|
|
|
|
| 369 |
return {
|
| 370 |
"models": str(models),
|
| 371 |
"entries": str(rows),
|
| 372 |
+
"votes": str(votes_n),
|
| 373 |
"last_vote": last_vote or "—",
|
| 374 |
}
|
| 375 |
|
| 376 |
+
|
| 377 |
# ---------------------------
|
| 378 |
+
# Voting / profiles
|
| 379 |
# ---------------------------
|
| 380 |
def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
    """Choose two distinct models at random for a pairwise matchup.

    Candidates come from the top 50 rows of ``leaderboard_df`` for the given
    arena and provider. If that yields fewer than two names, the pool falls
    back to ``all_models()``; if there are still fewer than two, the
    placeholder pair ``("model-a", "model-b")`` is returned.
    """
    board = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)
    candidates = board["Model"].tolist()

    if len(candidates) < 2:
        # Filtered pool too small: widen to every model in the database.
        candidates = all_models()

    if len(candidates) >= 2:
        return tuple(random.sample(candidates, 2))
    return ("model-a", "model-b")
|
| 388 |
+
|
| 389 |
|
| 390 |
def model_card_md(model: str, arena: Optional[str] = None) -> str:
    """Build the Markdown card shown for a model.

    The card always has a heading with the model name and a provider badge
    (provider from ``guess_provider``). When ``arena`` is truthy, a blank
    line and the arena name, rounded score and vote count from
    ``get_rating`` are appended. Lines are joined with newlines.
    """
    card_lines = [
        f"### {model}",
        f"<span class='zen-badge'>{guess_provider(model)}</span>",
    ]
    if not arena:
        return "\n".join(card_lines)

    rating, vote_count, _stored_provider = get_rating(arena, model)
    card_lines.extend(
        [
            "",
            f"**Arena:** {arena}",
            f"**Score:** {int(round(rating))}",
            f"**Votes:** {vote_count}",
        ]
    )
    return "\n".join(card_lines)
|
| 397 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
    """Return a per-arena score table and an HTML summary for one model.

    Args:
        model: Model identifier; ``None``/blank yields an empty table and a
            "No model selected." card.

    Returns:
        A ``(df, summary)`` tuple. ``df`` has one row per entry in ``ARENAS``
        with columns ``Arena``, ``Score``, ``Votes``, ``Provider`` sorted by
        ``Score`` descending. ``summary`` is an HTML snippet naming the
        first (best) and last (worst) rows of that table.
    """
    # Local import: only this block needs the escaping helper.
    import html

    ensure_seed_once()
    model = (model or "").strip()
    if not model:
        return pd.DataFrame(columns=["Arena", "Score", "Votes", "Provider"]), "<div class='zen-card'>No model selected.</div>"

    rows = []
    for a in ARENAS:
        r, v, p = get_rating(a, model)
        rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
    df = pd.DataFrame(rows).sort_values("Score", ascending=False)

    best = df.iloc[0]
    worst = df.iloc[-1]

    # The summary is rendered as raw HTML, and model names can be entered by
    # users, so escape every interpolated text value before embedding it.
    safe_model = html.escape(model)
    safe_provider = html.escape(guess_provider(model))
    best_arena = html.escape(str(best["Arena"]))
    worst_arena = html.escape(str(worst["Arena"]))

    summary = (
        "<div class='zen-card'>"
        "<div class='zen-title'>Model Profile</div>"
        f"<div class='zen-sub'><b>{safe_model}</b> · Provider: <b>{safe_provider}</b></div>"
        "<div class='zen-hr'></div>"
        "<div class='zen-note'>"
        f"Best arena: <b>{best_arena}</b> (Score {best['Score']}, Votes {best['Votes']}). "
        f"Worst arena: <b>{worst_arena}</b> (Score {worst['Score']}, Votes {worst['Votes']})."
        "</div>"
        "</div>"
    )
    return df, summary
|
| 425 |
|
| 426 |
+
|
| 427 |
# ---------------------------
|
| 428 |
+
# App UI
|
| 429 |
# ---------------------------
|
| 430 |
ensure_seed_once()
|
| 431 |
+
|
| 432 |
css = ""
|
| 433 |
if os.path.exists(CSS_PATH):
|
| 434 |
with open(CSS_PATH, "r", encoding="utf-8") as f:
|
| 435 |
css = f.read()
|
| 436 |
|
| 437 |
+
with gr.Blocks(title="ZEN Model Arena Leaderboard") as demo:
|
| 438 |
+
if css:
|
| 439 |
+
gr.HTML(f"<style>{css}</style>")
|
| 440 |
|
| 441 |
k = kpis()
|
| 442 |
header = f"""
|
| 443 |
<div class="zen-card">
|
| 444 |
<div class="zen-title">ZEN Model Arena Leaderboard</div>
|
| 445 |
<p class="zen-sub">
|
| 446 |
+
Multi-arena rankings (Text · WebDev · Vision · Image · Video · Search) with a cross-arena overview matrix and live Elo voting.
|
| 447 |
</p>
|
| 448 |
<div class="zen-kpi">
|
| 449 |
<div><div class="k">Models</div><div class="v">{k['models']}</div><div class="s">unique IDs tracked</div></div>
|
|
|
|
| 452 |
<div><div class="k">Last Vote</div><div class="v" style="font-size:12px; font-weight:700;">{k['last_vote']}</div><div class="s">UTC</div></div>
|
| 453 |
</div>
|
| 454 |
<div class="zen-hr"></div>
|
| 455 |
+
<span class="zen-badge">Gradio 6.2.0</span>
|
| 456 |
<span class="zen-badge">SQLite + FileLock</span>
|
| 457 |
+
<span class="zen-badge">Arena Matrix</span>
|
| 458 |
<span class="zen-badge">Search + Filters</span>
|
| 459 |
</div>
|
| 460 |
"""
|
|
|
|
| 464 |
|
| 465 |
with gr.Tabs():
|
| 466 |
|
|
|
|
| 467 |
# Overview
|
|
|
|
| 468 |
with gr.Tab("Leaderboard Overview"):
|
| 469 |
+
gr.Markdown("### Top 10 (live DB)\nMirrors the snapshot format, but runs off the DB.")
|
| 470 |
|
| 471 |
with gr.Row():
|
| 472 |
arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
|
| 473 |
+
provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
|
| 474 |
min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 475 |
+
search_ov = gr.Textbox(value="", label="Search models", placeholder="gpt, gemini, claude, flux...")
|
| 476 |
|
| 477 |
+
df_ov = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
|
| 478 |
refresh_ov = gr.Button("Refresh overview", variant="primary")
|
| 479 |
|
| 480 |
def refresh_overview(arena, provider, min_votes, search):
|
|
|
|
| 483 |
refresh_ov.click(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
|
| 484 |
demo.load(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
|
| 485 |
|
| 486 |
+
# Matrix
|
|
|
|
|
|
|
| 487 |
with gr.Tab("Arena Overview Matrix"):
|
| 488 |
+
gr.Markdown("### Cross-arena placements\nRank/Score/Votes per arena in one wide matrix.")
|
| 489 |
|
| 490 |
with gr.Row():
|
| 491 |
provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
|
| 492 |
min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 493 |
+
search_mx = gr.Textbox(value="", label="Search")
|
| 494 |
+
limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models")
|
| 495 |
|
| 496 |
+
mx = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
|
| 497 |
refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")
|
| 498 |
|
| 499 |
def build_matrix(provider, min_votes, search, limit_models):
|
|
|
|
| 502 |
refresh_mx.click(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
|
| 503 |
demo.load(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
|
| 504 |
|
| 505 |
+
# Arena tabs
|
|
|
|
|
|
|
| 506 |
for arena in ARENAS:
|
| 507 |
with gr.Tab(arena):
|
| 508 |
+
gr.Markdown(f"### {arena} Leaderboard (live DB)")
|
| 509 |
|
| 510 |
with gr.Row():
|
| 511 |
provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
|
| 512 |
min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
|
| 513 |
search = gr.Textbox(value="", label="Search")
|
| 514 |
|
| 515 |
+
df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
|
| 516 |
btn = gr.Button("Refresh", variant="primary")
|
| 517 |
|
| 518 |
+
btn.click(
|
| 519 |
+
lambda p, mv, s, a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
|
| 520 |
+
inputs=[provider_sel, min_votes, search],
|
| 521 |
+
outputs=[df],
|
| 522 |
+
)
|
| 523 |
+
demo.load(lambda a=arena: leaderboard_df(a, limit=150), outputs=[df])
|
| 524 |
|
| 525 |
+
# Voting
|
|
|
|
|
|
|
| 526 |
with gr.Tab("Start Voting"):
|
| 527 |
+
gr.Markdown("### Pairwise Voting (Elo)\nPick a winner for a specific arena. Scores update instantly.")
|
|
|
|
|
|
|
|
|
|
| 528 |
|
| 529 |
with gr.Row():
|
| 530 |
arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
|
| 531 |
provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
|
| 532 |
+
new_match_btn = gr.Button("New Matchup", variant="primary")
|
| 533 |
|
| 534 |
left_state = gr.State("")
|
| 535 |
right_state = gr.State("")
|
|
|
|
| 548 |
a, b = pick_pair(arena, provider=provider)
|
| 549 |
return model_card_md(a, arena), model_card_md(b, arena), a, b, "<div class='zen-note'>New matchup ready.</div>"
|
| 550 |
|
|
|
|
|
|
|
|
|
|
| 551 |
def left_wins(arena, left, right, provider):
    """Handle a click on the left-hand vote button.

    When both model names are non-empty, the left model is recorded as the
    winner over the right model in ``arena`` via ``vote``. A fresh matchup
    is then produced by ``new_matchup`` regardless of whether a vote was
    recorded.

    Args:
        arena: Arena selected in the voting tab.
        left: Model name currently held in the left-side state.
        right: Model name currently held in the right-side state.
        provider: Provider pool used to pick the next pair.

    Returns:
        Whatever ``new_matchup(arena, provider)`` returns; the click handler
        routes it to the left/right cards, both state holders and the vote
        status element.
    """
    # The state holders are initialised to "" — do not record a vote until
    # a real pair has been dealt on both sides.
    both_present = bool(left and right)
    if both_present:
        vote(arena, winner=left, loser=right)

    refreshed = new_matchup(arena, provider)
    return refreshed
|
| 555 |
|
| 556 |
def right_wins(arena, left, right, provider):
    """Handle a click on the right-hand vote button.

    When both model names are non-empty, the right model is recorded as the
    winner over the left model in ``arena`` via ``vote``. A fresh matchup
    is then produced by ``new_matchup`` regardless of whether a vote was
    recorded.

    Args:
        arena: Arena selected in the voting tab.
        left: Model name currently held in the left-side state.
        right: Model name currently held in the right-side state.
        provider: Provider pool used to pick the next pair.

    Returns:
        Whatever ``new_matchup(arena, provider)`` returns; the click handler
        routes it to the left/right cards, both state holders and the vote
        status element.
    """
    # The state holders are initialised to "" — do not record a vote until
    # a real pair has been dealt on both sides.
    both_present = bool(left and right)
    if both_present:
        vote(arena, winner=right, loser=left)

    refreshed = new_matchup(arena, provider)
    return refreshed
|
| 560 |
|
| 561 |
+
new_match_btn.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 562 |
+
left_btn.click(left_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 563 |
+
right_btn.click(right_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 564 |
+
demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
|
| 565 |
|
| 566 |
+
# Profiles
|
|
|
|
|
|
|
| 567 |
with gr.Tab("Model Profiles"):
|
| 568 |
+
gr.Markdown("### Inspect a model across arenas")
|
| 569 |
|
| 570 |
+
models = all_models()
|
| 571 |
+
model_dd = gr.Dropdown(choices=models, value=(models[0] if models else None), label="Model")
|
| 572 |
prof_summary = gr.HTML()
|
| 573 |
+
prof_df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
|
|
|
|
| 574 |
load_btn = gr.Button("Load Profile", variant="primary")
|
| 575 |
+
|
| 576 |
load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
|
| 577 |
demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
|
| 578 |
|
| 579 |
+
# Admin
|
|
|
|
|
|
|
| 580 |
with gr.Tab("Admin"):
|
| 581 |
+
gr.Markdown("### Admin Tools")
|
| 582 |
|
| 583 |
with gr.Row():
|
| 584 |
+
reseed_force = gr.Checkbox(value=False, label="Force reseed (wipe DB first)")
|
| 585 |
reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
|
| 586 |
|
| 587 |
reseed_out = gr.JSON()
|
| 588 |
+
reseed_btn.click(lambda force: seed_from_json(force=bool(force)), inputs=[reseed_force], outputs=[reseed_out])
|
| 589 |
|
| 590 |
+
gr.Markdown("#### Add a model to arenas")
|
|
|
|
|
|
|
|
|
|
| 591 |
new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
|
| 592 |
new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
|
| 593 |
arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
|
|
|
|
| 607 |
|
| 608 |
add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])
|
| 609 |
|
| 610 |
+
gr.Markdown("#### Sanity")
|
| 611 |
sanity = gr.JSON()
|
| 612 |
+
|
| 613 |
def sanity_check():
|
| 614 |
return {
|
| 615 |
"time_utc": now_iso(),
|
|
|
|
| 620 |
"providers_detected": providers_list(),
|
| 621 |
"models_count": len(all_models()),
|
| 622 |
}
|
| 623 |
+
|
| 624 |
demo.load(sanity_check, outputs=[sanity])
|
| 625 |
|
| 626 |
gr.close_all()
|