ZENLLC committed on
Commit
54d6710
·
verified ·
1 Parent(s): 5e356e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -141
app.py CHANGED
@@ -2,7 +2,6 @@ import json
2
  import os
3
  import random
4
  import sqlite3
5
- from dataclasses import dataclass
6
  from datetime import datetime
7
  from typing import Dict, List, Tuple, Optional
8
 
@@ -33,12 +32,13 @@ CSS_PATH = os.path.join("assets", "zen.css")
33
  DEFAULT_RATING = 1200.0
34
  K_FACTOR = 16.0
35
 
 
36
  # ---------------------------
37
- # Helpers: provider tagging (heuristic)
38
  # ---------------------------
39
  def guess_provider(model: str) -> str:
40
- m = model.lower()
41
- if "gpt" in m or "chatgpt" in m or "o3" in m:
42
  return "OpenAI"
43
  if "gemini" in m or "veo" in m:
44
  return "Google"
@@ -52,8 +52,6 @@ def guess_provider(model: str) -> str:
52
  return "Black Forest Labs"
53
  if "kling" in m:
54
  return "Kuaishou"
55
- if "sora" in m:
56
- return "OpenAI"
57
  if "wan" in m:
58
  return "WAN"
59
  if "hunyuan" in m:
@@ -62,10 +60,11 @@ def guess_provider(model: str) -> str:
62
  return "ByteDance"
63
  return "Other"
64
 
 
65
  # ---------------------------
66
  # SQLite persistence
67
  # ---------------------------
68
- def db():
69
  conn = sqlite3.connect(DB_PATH, check_same_thread=False)
70
  conn.execute(
71
  """
@@ -94,11 +93,19 @@ def db():
94
  conn.commit()
95
  return conn
96
 
 
97
  def now_iso() -> str:
98
  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
99
 
100
- def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING):
101
- provider = provider or guess_provider(model)
 
 
 
 
 
 
 
102
  with FileLock(DB_LOCK):
103
  conn = db()
104
  conn.execute(
@@ -108,6 +115,7 @@ def ensure_model(arena: str, model: str, provider: Optional[str] = None, default
108
  conn.commit()
109
  conn.close()
110
 
 
111
  def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
112
  with FileLock(DB_LOCK):
113
  conn = db()
@@ -115,10 +123,12 @@ def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
115
  cur.execute("SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?", (arena, model))
116
  row = cur.fetchone()
117
  conn.close()
 
118
  if row is None:
119
  return (DEFAULT_RATING, 0, guess_provider(model))
120
  return (float(row[0]), int(row[1]), str(row[2]))
121
 
 
122
  def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
123
  ea = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
124
  sa = 1.0 if a_wins else 0.0
@@ -126,12 +136,14 @@ def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tup
126
  new_b = r_b + k * ((1.0 - sa) - (1.0 - ea))
127
  return new_a, new_b
128
 
129
- def vote(arena: str, winner: str, loser: str):
 
130
  ensure_model(arena, winner)
131
  ensure_model(arena, loser)
132
 
133
  r_w, v_w, p_w = get_rating(arena, winner)
134
  r_l, v_l, p_l = get_rating(arena, loser)
 
135
  new_w, new_l = elo_update(r_w, r_l, True)
136
 
137
  with FileLock(DB_LOCK):
@@ -151,9 +163,10 @@ def vote(arena: str, winner: str, loser: str):
151
  conn.commit()
152
  conn.close()
153
 
154
- def seed_from_json(force: bool = False) -> Dict[str, int]:
 
155
  if not os.path.exists(SEED_PATH):
156
- return {"seeded_rows": 0, "note": "seed file missing"}
157
 
158
  with open(SEED_PATH, "r", encoding="utf-8") as f:
159
  seed = json.load(f)
@@ -165,13 +178,16 @@ def seed_from_json(force: bool = False) -> Dict[str, int]:
165
 
166
  if force:
167
  cur.execute("DELETE FROM ratings")
 
168
  conn.commit()
169
 
170
  for arena, rows in seed.items():
171
  if arena not in ARENAS:
172
  continue
173
  for item in rows:
174
- model = item["model"]
 
 
175
  score = float(item.get("score", DEFAULT_RATING))
176
  votes_n = int(item.get("votes", 0))
177
  provider = guess_provider(model)
@@ -188,9 +204,10 @@ def seed_from_json(force: bool = False) -> Dict[str, int]:
188
  conn.commit()
189
  conn.close()
190
 
191
- return {"seeded_rows": seeded, "note": "ok"}
 
192
 
193
- def ensure_seed_once():
194
  with FileLock(DB_LOCK):
195
  conn = db()
196
  cur = conn.cursor()
@@ -200,26 +217,54 @@ def ensure_seed_once():
200
  if n == 0:
201
  seed_from_json(force=False)
202
 
 
203
  # ---------------------------
204
- # Leaderboards / Matrices
205
  # ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
207
  ensure_seed_once()
208
- search = (search or "").strip()
 
 
209
  provider = provider or "All"
 
210
 
211
  where = ["arena = ?"]
212
  params: List[object] = [arena]
213
 
214
  if search:
215
  where.append("LOWER(model) LIKE ?")
216
- params.append(f"%{search.lower()}%")
 
217
  if provider != "All":
218
  where.append("provider = ?")
219
  params.append(provider)
 
220
  if min_votes > 0:
221
  where.append("votes >= ?")
222
- params.append(int(min_votes))
223
 
224
  where_sql = " AND ".join(where)
225
  q = f"""
@@ -244,39 +289,18 @@ def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_vote
244
  df.insert(0, "Rank", np.arange(1, len(df) + 1))
245
  return df[["Rank", "Model", "Provider", "Score", "Votes", "Updated"]]
246
 
247
- def providers_list() -> List[str]:
248
- ensure_seed_once()
249
- with FileLock(DB_LOCK):
250
- conn = db()
251
- cur = conn.cursor()
252
- cur.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC")
253
- rows = [r[0] for r in cur.fetchall()]
254
- conn.close()
255
- return ["All"] + rows
256
-
257
- def all_models() -> List[str]:
258
- ensure_seed_once()
259
- with FileLock(DB_LOCK):
260
- conn = db()
261
- cur = conn.cursor()
262
- cur.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC")
263
- rows = [r[0] for r in cur.fetchall()]
264
- conn.close()
265
- return rows
266
 
267
  def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
268
- """
269
- Wide matrix: each model gets Rank/Score/Votes per arena (compact).
270
- """
271
  ensure_seed_once()
 
272
  search = (search or "").strip().lower()
 
 
 
273
 
274
  with FileLock(DB_LOCK):
275
  conn = db()
276
- base = pd.read_sql_query(
277
- "SELECT arena, model, provider, rating, votes FROM ratings",
278
- conn
279
- )
280
  conn.close()
281
 
282
  if base.empty:
@@ -285,15 +309,16 @@ def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: in
285
  if provider != "All":
286
  base = base[base["provider"] == provider]
287
  if min_votes > 0:
288
- base = base[base["votes"] >= int(min_votes)]
289
  if search:
290
  base = base[base["model"].str.lower().str.contains(search, na=False)]
291
 
292
- # rank within arena
 
 
293
  base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
294
  base["score"] = base["rating"].round().astype(int)
295
 
296
- # choose top N models by "best average rank across arenas they appear in"
297
  pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
298
  avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
299
  chosen = avg_rank.head(limit_models).index
@@ -301,31 +326,31 @@ def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: in
301
  base = base.set_index(["model", "provider"])
302
  base = base.loc[base.index.isin(chosen)].reset_index()
303
 
304
- # build matrix columns
305
- out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]})
306
- out = out.reset_index(drop=True)
307
 
308
  for a in ARENAS:
309
- sub = base[base["arena"] == a][["model", "provider", "rank", "score", "votes"]]
310
  sub = sub.rename(columns={
311
  "rank": f"{a} Rank",
312
  "score": f"{a} Score",
313
  "votes": f"{a} Votes",
314
  })
315
  out = out.merge(sub, how="left", left_on=["Model", "Provider"], right_on=["model", "provider"])
316
- out.drop(columns=[c for c in ["model", "provider"] if c in out.columns], inplace=True)
 
 
317
 
318
- # sort by best avg rank
319
  out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
320
  out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])
321
 
322
- # nicer types
323
  for a in ARENAS:
324
  for col in [f"{a} Rank", f"{a} Score", f"{a} Votes"]:
325
  if col in out.columns:
326
  out[col] = out[col].astype("Int64")
 
327
  return out
328
 
 
329
  def kpis() -> Dict[str, str]:
330
  ensure_seed_once()
331
  with FileLock(DB_LOCK):
@@ -336,7 +361,7 @@ def kpis() -> Dict[str, str]:
336
  cur.execute("SELECT COUNT(*) FROM ratings")
337
  rows = cur.fetchone()[0]
338
  cur.execute("SELECT COUNT(*) FROM votes_log")
339
- votes = cur.fetchone()[0]
340
  cur.execute("SELECT MAX(created_at) FROM votes_log")
341
  last_vote = cur.fetchone()[0]
342
  conn.close()
@@ -344,75 +369,81 @@ def kpis() -> Dict[str, str]:
344
  return {
345
  "models": str(models),
346
  "entries": str(rows),
347
- "votes": str(votes),
348
  "last_vote": last_vote or "—",
349
  }
350
 
 
351
  # ---------------------------
352
- # Voting mechanics UI
353
  # ---------------------------
354
  def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
355
  df = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)
356
  models = df["Model"].tolist()
357
  if len(models) < 2:
358
- models = [m for m in all_models()]
359
  if len(models) < 2:
360
  return ("model-a", "model-b")
361
- a, b = random.sample(models, 2)
362
- return a, b
363
 
364
  def model_card_md(model: str, arena: Optional[str] = None) -> str:
365
  provider = guess_provider(model)
366
- lines = [f"### {model}", f"<span class='zen-badge'>{provider}</span>"]
367
  if arena:
368
- r, v, p = get_rating(arena, model)
369
- lines += [
370
- "",
371
- f"**Arena:** {arena}",
372
- f"**Score:** {int(round(r))}",
373
- f"**Votes:** {v}",
374
- ]
375
- return "\n".join(lines)
376
 
377
  def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
378
  ensure_seed_once()
 
 
 
 
379
  rows = []
380
  for a in ARENAS:
381
  r, v, p = get_rating(a, model)
382
  rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
383
  df = pd.DataFrame(rows).sort_values("Score", ascending=False)
 
384
  best = df.iloc[0]
385
  worst = df.iloc[-1]
386
  summary = (
387
- f"<div class='zen-card'>"
388
- f"<div class='zen-title'>Model Profile</div>"
389
  f"<div class='zen-sub'><b>{model}</b> · Provider: <b>{guess_provider(model)}</b></div>"
390
- f"<div class='zen-hr'></div>"
391
- f"<div class='zen-note'>"
392
  f"Best arena: <b>{best['Arena']}</b> (Score {best['Score']}, Votes {best['Votes']}). "
393
  f"Worst arena: <b>{worst['Arena']}</b> (Score {worst['Score']}, Votes {worst['Votes']})."
394
- f"</div>"
395
- f"</div>"
396
  )
397
  return df, summary
398
 
 
399
  # ---------------------------
400
- # Build UI
401
  # ---------------------------
402
  ensure_seed_once()
 
403
  css = ""
404
  if os.path.exists(CSS_PATH):
405
  with open(CSS_PATH, "r", encoding="utf-8") as f:
406
  css = f.read()
407
 
408
- with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
 
 
409
 
410
  k = kpis()
411
  header = f"""
412
  <div class="zen-card">
413
  <div class="zen-title">ZEN Model Arena Leaderboard</div>
414
  <p class="zen-sub">
415
- A multi-arena leaderboard (Text · WebDev · Vision · Image · Video · Search) with an Arena Overview matrix and live Elo voting.
416
  </p>
417
  <div class="zen-kpi">
418
  <div><div class="k">Models</div><div class="v">{k['models']}</div><div class="s">unique IDs tracked</div></div>
@@ -421,9 +452,9 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
421
  <div><div class="k">Last Vote</div><div class="v" style="font-size:12px; font-weight:700;">{k['last_vote']}</div><div class="s">UTC</div></div>
422
  </div>
423
  <div class="zen-hr"></div>
424
- <span class="zen-badge">Gradio 5.49.1</span>
425
  <span class="zen-badge">SQLite + FileLock</span>
426
- <span class="zen-badge">Multi-Arena Matrix</span>
427
  <span class="zen-badge">Search + Filters</span>
428
  </div>
429
  """
@@ -433,19 +464,17 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
433
 
434
  with gr.Tabs():
435
 
436
- # ---------------------------
437
  # Overview
438
- # ---------------------------
439
  with gr.Tab("Leaderboard Overview"):
440
- gr.Markdown("### Top 10 per Arena (seed snapshot) + live refresh\nThis view mirrors your snapshot format, but it’s wired to the live DB.")
441
 
442
  with gr.Row():
443
  arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
444
- provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider filter")
445
  min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
446
- search_ov = gr.Textbox(value="", label="Search models", placeholder="e.g., gpt, gemini, claude, flux...")
447
 
448
- df_ov = gr.Dataframe(interactive=False, wrap=True, height=520)
449
  refresh_ov = gr.Button("Refresh overview", variant="primary")
450
 
451
  def refresh_overview(arena, provider, min_votes, search):
@@ -454,19 +483,17 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
454
  refresh_ov.click(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
455
  demo.load(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
456
 
457
- # ---------------------------
458
- # Arena Overview Matrix (the cool part)
459
- # ---------------------------
460
  with gr.Tab("Arena Overview Matrix"):
461
- gr.Markdown("### Cross-arena placements (rank/score/votes per arena)\nThis is the wide “overview table” that makes the leaderboard feel *real*.")
462
 
463
  with gr.Row():
464
  provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
465
  min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
466
- search_mx = gr.Textbox(value="", label="Search", placeholder="Filter which models appear…")
467
- limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models in matrix")
468
 
469
- mx = gr.Dataframe(interactive=False, wrap=True, height=600)
470
  refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")
471
 
472
  def build_matrix(provider, min_votes, search, limit_models):
@@ -475,39 +502,34 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
475
  refresh_mx.click(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
476
  demo.load(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
477
 
478
- # ---------------------------
479
- # Per-arena deep view
480
- # ---------------------------
481
  for arena in ARENAS:
482
  with gr.Tab(arena):
483
- gr.Markdown(f"### {arena} Leaderboard (live DB)\nFilter, search, and export-ready table.")
484
 
485
  with gr.Row():
486
  provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
487
  min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
488
  search = gr.Textbox(value="", label="Search")
489
 
490
- df = gr.Dataframe(interactive=False, wrap=True, height=560)
491
  btn = gr.Button("Refresh", variant="primary")
492
 
493
- btn.click(lambda p, mv, s, a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
494
- inputs=[provider_sel, min_votes, search], outputs=[df])
495
- demo.load(lambda p="All", mv=0, s="", a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
496
- outputs=[df])
 
 
497
 
498
- # ---------------------------
499
- # Voting (pairwise)
500
- # ---------------------------
501
  with gr.Tab("Start Voting"):
502
- gr.Markdown(
503
- "### Pairwise Voting (Elo)\n"
504
- "Pick which model wins for a chosen arena. Scores update instantly and feed the Matrix."
505
- )
506
 
507
  with gr.Row():
508
  arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
509
  provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
510
- new_match = gr.Button("New Matchup", variant="primary")
511
 
512
  left_state = gr.State("")
513
  right_state = gr.State("")
@@ -526,56 +548,46 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
526
  a, b = pick_pair(arena, provider=provider)
527
  return model_card_md(a, arena), model_card_md(b, arena), a, b, "<div class='zen-note'>New matchup ready.</div>"
528
 
529
- new_match.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
530
- demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
531
-
532
  def left_wins(arena, left, right, provider):
533
- if not left or not right:
534
- return new_matchup(arena, provider)
535
- vote(arena, winner=left, loser=right)
536
  return new_matchup(arena, provider)
537
 
538
  def right_wins(arena, left, right, provider):
539
- if not left or not right:
540
- return new_matchup(arena, provider)
541
- vote(arena, winner=right, loser=left)
542
  return new_matchup(arena, provider)
543
 
544
- left_btn.click(left_wins, inputs=[arena_vote, left_state, right_state, provider_vote],
545
- outputs=[left_md, right_md, left_state, right_state, vote_status])
546
- right_btn.click(right_wins, inputs=[arena_vote, left_state, right_state, provider_vote],
547
- outputs=[left_md, right_md, left_state, right_state, vote_status])
548
 
549
- # ---------------------------
550
- # Model Profiles
551
- # ---------------------------
552
  with gr.Tab("Model Profiles"):
553
- gr.Markdown("### Inspect a model across arenas\nThis is how you turn “a table” into a *product*.")
554
 
555
- model_dd = gr.Dropdown(choices=all_models(), value=all_models()[0] if all_models() else None, label="Model")
 
556
  prof_summary = gr.HTML()
557
- prof_df = gr.Dataframe(interactive=False, wrap=True, height=360)
558
-
559
  load_btn = gr.Button("Load Profile", variant="primary")
 
560
  load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
561
  demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
562
 
563
- # ---------------------------
564
- # Admin / Ops
565
- # ---------------------------
566
  with gr.Tab("Admin"):
567
- gr.Markdown("### Admin Tools\nSeed DB, force reseed, add models, sanity checks.")
568
 
569
  with gr.Row():
570
- reseed_force = gr.Checkbox(value=False, label="Force reseed (wipes DB first)")
571
  reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
572
 
573
  reseed_out = gr.JSON()
 
574
 
575
- reseed_btn.click(lambda force: seed_from_json(force=bool(force)),
576
- inputs=[reseed_force], outputs=[reseed_out])
577
-
578
- gr.Markdown("#### Add a model to one or more arenas")
579
  new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
580
  new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
581
  arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
@@ -595,8 +607,9 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
595
 
596
  add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])
597
 
598
- gr.Markdown("#### Environment sanity")
599
  sanity = gr.JSON()
 
600
  def sanity_check():
601
  return {
602
  "time_utc": now_iso(),
@@ -607,6 +620,7 @@ with gr.Blocks(css=css, title="ZEN Model Arena Leaderboard") as demo:
607
  "providers_detected": providers_list(),
608
  "models_count": len(all_models()),
609
  }
 
610
  demo.load(sanity_check, outputs=[sanity])
611
 
612
  gr.close_all()
 
2
  import os
3
  import random
4
  import sqlite3
 
5
  from datetime import datetime
6
  from typing import Dict, List, Tuple, Optional
7
 
 
32
  DEFAULT_RATING = 1200.0
33
  K_FACTOR = 16.0
34
 
35
+
36
  # ---------------------------
37
+ # Provider tagging (heuristic)
38
  # ---------------------------
39
  def guess_provider(model: str) -> str:
40
+ m = (model or "").lower()
41
+ if "gpt" in m or "chatgpt" in m or m.startswith("o3"):
42
  return "OpenAI"
43
  if "gemini" in m or "veo" in m:
44
  return "Google"
 
52
  return "Black Forest Labs"
53
  if "kling" in m:
54
  return "Kuaishou"
 
 
55
  if "wan" in m:
56
  return "WAN"
57
  if "hunyuan" in m:
 
60
  return "ByteDance"
61
  return "Other"
62
 
63
+
64
  # ---------------------------
65
  # SQLite persistence
66
  # ---------------------------
67
+ def db() -> sqlite3.Connection:
68
  conn = sqlite3.connect(DB_PATH, check_same_thread=False)
69
  conn.execute(
70
  """
 
93
  conn.commit()
94
  return conn
95
 
96
+
97
  def now_iso() -> str:
98
  return datetime.utcnow().isoformat(timespec="seconds") + "Z"
99
 
100
+
101
+ def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING) -> None:
102
+ if arena not in ARENAS:
103
+ return
104
+ model = (model or "").strip()
105
+ if not model:
106
+ return
107
+ provider = (provider or "").strip() or guess_provider(model)
108
+
109
  with FileLock(DB_LOCK):
110
  conn = db()
111
  conn.execute(
 
115
  conn.commit()
116
  conn.close()
117
 
118
+
119
  def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
120
  with FileLock(DB_LOCK):
121
  conn = db()
 
123
  cur.execute("SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?", (arena, model))
124
  row = cur.fetchone()
125
  conn.close()
126
+
127
  if row is None:
128
  return (DEFAULT_RATING, 0, guess_provider(model))
129
  return (float(row[0]), int(row[1]), str(row[2]))
130
 
131
+
132
  def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
133
  ea = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
134
  sa = 1.0 if a_wins else 0.0
 
136
  new_b = r_b + k * ((1.0 - sa) - (1.0 - ea))
137
  return new_a, new_b
138
 
139
+
140
+ def vote(arena: str, winner: str, loser: str) -> None:
141
  ensure_model(arena, winner)
142
  ensure_model(arena, loser)
143
 
144
  r_w, v_w, p_w = get_rating(arena, winner)
145
  r_l, v_l, p_l = get_rating(arena, loser)
146
+
147
  new_w, new_l = elo_update(r_w, r_l, True)
148
 
149
  with FileLock(DB_LOCK):
 
163
  conn.commit()
164
  conn.close()
165
 
166
+
167
+ def seed_from_json(force: bool = False) -> Dict[str, object]:
168
  if not os.path.exists(SEED_PATH):
169
+ return {"ok": False, "seeded_rows": 0, "note": "Missing data/seed_snapshot.json"}
170
 
171
  with open(SEED_PATH, "r", encoding="utf-8") as f:
172
  seed = json.load(f)
 
178
 
179
  if force:
180
  cur.execute("DELETE FROM ratings")
181
+ cur.execute("DELETE FROM votes_log")
182
  conn.commit()
183
 
184
  for arena, rows in seed.items():
185
  if arena not in ARENAS:
186
  continue
187
  for item in rows:
188
+ model = str(item.get("model", "")).strip()
189
+ if not model:
190
+ continue
191
  score = float(item.get("score", DEFAULT_RATING))
192
  votes_n = int(item.get("votes", 0))
193
  provider = guess_provider(model)
 
204
  conn.commit()
205
  conn.close()
206
 
207
+ return {"ok": True, "seeded_rows": seeded, "note": "Seeded successfully"}
208
+
209
 
210
+ def ensure_seed_once() -> None:
211
  with FileLock(DB_LOCK):
212
  conn = db()
213
  cur = conn.cursor()
 
217
  if n == 0:
218
  seed_from_json(force=False)
219
 
220
+
221
  # ---------------------------
222
+ # Query helpers
223
  # ---------------------------
224
+ def providers_list() -> List[str]:
225
+ ensure_seed_once()
226
+ with FileLock(DB_LOCK):
227
+ conn = db()
228
+ cur = conn.cursor()
229
+ cur.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC")
230
+ rows = [r[0] for r in cur.fetchall()]
231
+ conn.close()
232
+ return ["All"] + rows
233
+
234
+
235
+ def all_models() -> List[str]:
236
+ ensure_seed_once()
237
+ with FileLock(DB_LOCK):
238
+ conn = db()
239
+ cur = conn.cursor()
240
+ cur.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC")
241
+ rows = [r[0] for r in cur.fetchall()]
242
+ conn.close()
243
+ return rows
244
+
245
+
246
  def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
247
  ensure_seed_once()
248
+
249
+ arena = arena if arena in ARENAS else "Text"
250
+ search = (search or "").strip().lower()
251
  provider = provider or "All"
252
+ min_votes = int(min_votes or 0)
253
 
254
  where = ["arena = ?"]
255
  params: List[object] = [arena]
256
 
257
  if search:
258
  where.append("LOWER(model) LIKE ?")
259
+ params.append(f"%{search}%")
260
+
261
  if provider != "All":
262
  where.append("provider = ?")
263
  params.append(provider)
264
+
265
  if min_votes > 0:
266
  where.append("votes >= ?")
267
+ params.append(min_votes)
268
 
269
  where_sql = " AND ".join(where)
270
  q = f"""
 
289
  df.insert(0, "Rank", np.arange(1, len(df) + 1))
290
  return df[["Rank", "Model", "Provider", "Score", "Votes", "Updated"]]
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
 
 
 
294
  ensure_seed_once()
295
+
296
  search = (search or "").strip().lower()
297
+ provider = provider or "All"
298
+ min_votes = int(min_votes or 0)
299
+ limit_models = int(limit_models or 200)
300
 
301
  with FileLock(DB_LOCK):
302
  conn = db()
303
+ base = pd.read_sql_query("SELECT arena, model, provider, rating, votes FROM ratings", conn)
 
 
 
304
  conn.close()
305
 
306
  if base.empty:
 
309
  if provider != "All":
310
  base = base[base["provider"] == provider]
311
  if min_votes > 0:
312
+ base = base[base["votes"] >= min_votes]
313
  if search:
314
  base = base[base["model"].str.lower().str.contains(search, na=False)]
315
 
316
+ if base.empty:
317
+ return pd.DataFrame()
318
+
319
  base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
320
  base["score"] = base["rating"].round().astype(int)
321
 
 
322
  pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
323
  avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
324
  chosen = avg_rank.head(limit_models).index
 
326
  base = base.set_index(["model", "provider"])
327
  base = base.loc[base.index.isin(chosen)].reset_index()
328
 
329
+ out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]}).reset_index(drop=True)
 
 
330
 
331
  for a in ARENAS:
332
+ sub = base[base["arena"] == a][["model", "provider", "rank", "score", "votes"]].copy()
333
  sub = sub.rename(columns={
334
  "rank": f"{a} Rank",
335
  "score": f"{a} Score",
336
  "votes": f"{a} Votes",
337
  })
338
  out = out.merge(sub, how="left", left_on=["Model", "Provider"], right_on=["model", "provider"])
339
+ for c in ["model", "provider"]:
340
+ if c in out.columns:
341
+ out.drop(columns=[c], inplace=True)
342
 
 
343
  out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
344
  out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])
345
 
 
346
  for a in ARENAS:
347
  for col in [f"{a} Rank", f"{a} Score", f"{a} Votes"]:
348
  if col in out.columns:
349
  out[col] = out[col].astype("Int64")
350
+
351
  return out
352
 
353
+
354
  def kpis() -> Dict[str, str]:
355
  ensure_seed_once()
356
  with FileLock(DB_LOCK):
 
361
  cur.execute("SELECT COUNT(*) FROM ratings")
362
  rows = cur.fetchone()[0]
363
  cur.execute("SELECT COUNT(*) FROM votes_log")
364
+ votes_n = cur.fetchone()[0]
365
  cur.execute("SELECT MAX(created_at) FROM votes_log")
366
  last_vote = cur.fetchone()[0]
367
  conn.close()
 
369
  return {
370
  "models": str(models),
371
  "entries": str(rows),
372
+ "votes": str(votes_n),
373
  "last_vote": last_vote or "—",
374
  }
375
 
376
+
377
  # ---------------------------
378
+ # Voting / profiles
379
  # ---------------------------
380
  def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
381
  df = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)
382
  models = df["Model"].tolist()
383
  if len(models) < 2:
384
+ models = all_models()
385
  if len(models) < 2:
386
  return ("model-a", "model-b")
387
+ return tuple(random.sample(models, 2))
388
+
389
 
390
  def model_card_md(model: str, arena: Optional[str] = None) -> str:
391
  provider = guess_provider(model)
392
+ out = [f"### {model}", f"<span class='zen-badge'>{provider}</span>"]
393
  if arena:
394
+ r, v, _ = get_rating(arena, model)
395
+ out += ["", f"**Arena:** {arena}", f"**Score:** {int(round(r))}", f"**Votes:** {v}"]
396
+ return "\n".join(out)
397
+
 
 
 
 
398
 
399
  def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
400
  ensure_seed_once()
401
+ model = (model or "").strip()
402
+ if not model:
403
+ return pd.DataFrame(columns=["Arena", "Score", "Votes", "Provider"]), "<div class='zen-card'>No model selected.</div>"
404
+
405
  rows = []
406
  for a in ARENAS:
407
  r, v, p = get_rating(a, model)
408
  rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
409
  df = pd.DataFrame(rows).sort_values("Score", ascending=False)
410
+
411
  best = df.iloc[0]
412
  worst = df.iloc[-1]
413
  summary = (
414
+ "<div class='zen-card'>"
415
+ "<div class='zen-title'>Model Profile</div>"
416
  f"<div class='zen-sub'><b>{model}</b> · Provider: <b>{guess_provider(model)}</b></div>"
417
+ "<div class='zen-hr'></div>"
418
+ "<div class='zen-note'>"
419
  f"Best arena: <b>{best['Arena']}</b> (Score {best['Score']}, Votes {best['Votes']}). "
420
  f"Worst arena: <b>{worst['Arena']}</b> (Score {worst['Score']}, Votes {worst['Votes']})."
421
+ "</div>"
422
+ "</div>"
423
  )
424
  return df, summary
425
 
426
+
427
  # ---------------------------
428
+ # App UI
429
  # ---------------------------
430
  ensure_seed_once()
431
+
432
  css = ""
433
  if os.path.exists(CSS_PATH):
434
  with open(CSS_PATH, "r", encoding="utf-8") as f:
435
  css = f.read()
436
 
437
+ with gr.Blocks(title="ZEN Model Arena Leaderboard") as demo:
438
+ if css:
439
+ gr.HTML(f"<style>{css}</style>")
440
 
441
  k = kpis()
442
  header = f"""
443
  <div class="zen-card">
444
  <div class="zen-title">ZEN Model Arena Leaderboard</div>
445
  <p class="zen-sub">
446
+ Multi-arena rankings (Text · WebDev · Vision · Image · Video · Search) with a cross-arena overview matrix and live Elo voting.
447
  </p>
448
  <div class="zen-kpi">
449
  <div><div class="k">Models</div><div class="v">{k['models']}</div><div class="s">unique IDs tracked</div></div>
 
452
  <div><div class="k">Last Vote</div><div class="v" style="font-size:12px; font-weight:700;">{k['last_vote']}</div><div class="s">UTC</div></div>
453
  </div>
454
  <div class="zen-hr"></div>
455
+ <span class="zen-badge">Gradio 6.2.0</span>
456
  <span class="zen-badge">SQLite + FileLock</span>
457
+ <span class="zen-badge">Arena Matrix</span>
458
  <span class="zen-badge">Search + Filters</span>
459
  </div>
460
  """
 
464
 
465
  with gr.Tabs():
466
 
 
467
  # Overview
 
468
  with gr.Tab("Leaderboard Overview"):
469
+ gr.Markdown("### Top 10 (live DB)\nMirrors the snapshot format, but runs off the DB.")
470
 
471
  with gr.Row():
472
  arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
473
+ provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
474
  min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
475
+ search_ov = gr.Textbox(value="", label="Search models", placeholder="gpt, gemini, claude, flux...")
476
 
477
+ df_ov = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
478
  refresh_ov = gr.Button("Refresh overview", variant="primary")
479
 
480
  def refresh_overview(arena, provider, min_votes, search):
 
483
  refresh_ov.click(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
484
  demo.load(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
485
 
486
+ # Matrix
 
 
487
  with gr.Tab("Arena Overview Matrix"):
488
+ gr.Markdown("### Cross-arena placements\nRank/Score/Votes per arena in one wide matrix.")
489
 
490
  with gr.Row():
491
  provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
492
  min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
493
+ search_mx = gr.Textbox(value="", label="Search")
494
+ limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models")
495
 
496
+ mx = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
497
  refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")
498
 
499
  def build_matrix(provider, min_votes, search, limit_models):
 
502
  refresh_mx.click(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
503
  demo.load(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
504
 
505
+ # Arena tabs
 
 
506
  for arena in ARENAS:
507
  with gr.Tab(arena):
508
+ gr.Markdown(f"### {arena} Leaderboard (live DB)")
509
 
510
  with gr.Row():
511
  provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
512
  min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
513
  search = gr.Textbox(value="", label="Search")
514
 
515
+ df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
516
  btn = gr.Button("Refresh", variant="primary")
517
 
518
+ btn.click(
519
+ lambda p, mv, s, a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
520
+ inputs=[provider_sel, min_votes, search],
521
+ outputs=[df],
522
+ )
523
+ demo.load(lambda a=arena: leaderboard_df(a, limit=150), outputs=[df])
524
 
525
+ # Voting
 
 
526
  with gr.Tab("Start Voting"):
527
+ gr.Markdown("### Pairwise Voting (Elo)\nPick a winner for a specific arena. Scores update instantly.")
 
 
 
528
 
529
  with gr.Row():
530
  arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
531
  provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
532
+ new_match_btn = gr.Button("New Matchup", variant="primary")
533
 
534
  left_state = gr.State("")
535
  right_state = gr.State("")
 
548
  a, b = pick_pair(arena, provider=provider)
549
  return model_card_md(a, arena), model_card_md(b, arena), a, b, "<div class='zen-note'>New matchup ready.</div>"
550
 
 
 
 
551
  def left_wins(arena, left, right, provider):
552
+ if left and right:
553
+ vote(arena, winner=left, loser=right)
 
554
  return new_matchup(arena, provider)
555
 
556
  def right_wins(arena, left, right, provider):
557
+ if left and right:
558
+ vote(arena, winner=right, loser=left)
 
559
  return new_matchup(arena, provider)
560
 
561
+ new_match_btn.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
562
+ left_btn.click(left_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
563
+ right_btn.click(right_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
564
+ demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
565
 
566
+ # Profiles
 
 
567
  with gr.Tab("Model Profiles"):
568
+ gr.Markdown("### Inspect a model across arenas")
569
 
570
+ models = all_models()
571
+ model_dd = gr.Dropdown(choices=models, value=(models[0] if models else None), label="Model")
572
  prof_summary = gr.HTML()
573
+ prof_df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
 
574
  load_btn = gr.Button("Load Profile", variant="primary")
575
+
576
  load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
577
  demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
578
 
579
+ # Admin
 
 
580
  with gr.Tab("Admin"):
581
+ gr.Markdown("### Admin Tools")
582
 
583
  with gr.Row():
584
+ reseed_force = gr.Checkbox(value=False, label="Force reseed (wipe DB first)")
585
  reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
586
 
587
  reseed_out = gr.JSON()
588
+ reseed_btn.click(lambda force: seed_from_json(force=bool(force)), inputs=[reseed_force], outputs=[reseed_out])
589
 
590
+ gr.Markdown("#### Add a model to arenas")
 
 
 
591
  new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
592
  new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
593
  arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
 
607
 
608
  add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])
609
 
610
+ gr.Markdown("#### Sanity")
611
  sanity = gr.JSON()
612
+
613
  def sanity_check():
614
  return {
615
  "time_utc": now_iso(),
 
620
  "providers_detected": providers_list(),
621
  "models_count": len(all_models()),
622
  }
623
+
624
  demo.load(sanity_check, outputs=[sanity])
625
 
626
  gr.close_all()