ZENLLC committed on
Commit
4f09bb6
·
verified ·
1 Parent(s): 54d6710

Create scripts/sync_lmarena.py

Browse files
Files changed (1) hide show
  1. scripts/sync_lmarena.py +81 -0
scripts/sync_lmarena.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scripts/sync_lmarena.py
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import os
6
+ import re
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Optional, Tuple
10
+
11
+ from huggingface_hub import HfApi, hf_hub_download
12
+
13
# Hub repo id of the Space whose files are listed and downloaded.
UPSTREAM_SPACE = "lmarena-ai/lmarena-leaderboard"  # source Space

# Local output locations: one directory holding the copied CSV and a JSON
# sidecar describing the sync.
LOCAL_DIR = Path("data/lmarena")
LOCAL_CSV = LOCAL_DIR.joinpath("leaderboard_table_latest.csv")
LOCAL_META = LOCAL_DIR.joinpath("sync_meta.json")
17
+
18
+ LEADERBOARD_RE = re.compile(r"^leaderboard_table_(\d{8})\.csv$")
19
+
20
+
21
+ def _pick_latest_leaderboard_file(files: list[str]) -> Optional[Tuple[str, str]]:
22
+ """
23
+ Returns (filename, yyyymmdd) for the newest leaderboard_table_YYYYMMDD.csv found.
24
+ """
25
+ candidates: list[Tuple[str, str]] = []
26
+ for f in files:
27
+ m = LEADERBOARD_RE.match(f)
28
+ if m:
29
+ candidates.append((f, m.group(1)))
30
+
31
+ if not candidates:
32
+ return None
33
+
34
+ # Sort by date string; YYYYMMDD sorts lexicographically correctly
35
+ candidates.sort(key=lambda x: x[1])
36
+ return candidates[-1]
37
+
38
+
39
def main() -> int:
    """Copy the newest upstream leaderboard CSV into the local data directory.

    Lists the files of ``UPSTREAM_SPACE``, picks the newest
    ``leaderboard_table_YYYYMMDD.csv``, downloads it, writes its bytes to
    ``LOCAL_CSV`` and records the origin and sync time in ``LOCAL_META``.
    The ``HF_TOKEN`` environment variable, when set, is passed to the Hub
    client and to the download call.

    Returns:
        ``0`` after the CSV and the metadata file have been written.

    Raises:
        RuntimeError: If the upstream Space has no matching leaderboard file.
    """
    token = os.getenv("HF_TOKEN")  # optional for public, but recommended for rate limits
    api = HfApi(token=token)

    # List files from the upstream *space repo*.
    files = api.list_repo_files(repo_id=UPSTREAM_SPACE, repo_type="space")

    latest = _pick_latest_leaderboard_file(files)
    if latest is None:
        raise RuntimeError(
            f"No leaderboard_table_YYYYMMDD.csv found in upstream Space: {UPSTREAM_SPACE}"
        )

    filename, yyyymmdd = latest

    # hf_hub_download returns a local path to the fetched file; its contents
    # are then copied to our tracked path, leaving the downloaded file as-is.
    downloaded_path = hf_hub_download(
        repo_id=UPSTREAM_SPACE,
        repo_type="space",
        filename=filename,
        token=token,
    )

    LOCAL_DIR.mkdir(parents=True, exist_ok=True)

    # Copy (not move) the bytes: the source file is only read, never renamed.
    LOCAL_CSV.write_bytes(Path(downloaded_path).read_bytes())

    meta = {
        "source_space": UPSTREAM_SPACE,
        "source_filename": filename,
        "source_date": yyyymmdd,
        "synced_at_utc": datetime.now(timezone.utc).isoformat(),
    }
    LOCAL_META.write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"[OK] Synced {filename} -> {LOCAL_CSV}")
    print(json.dumps(meta, indent=2))
    return 0
78
+
79
+
80
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)