Hao Xu
committed on
Commit
·
3c856c0
1
Parent(s):
b58437b
leaderboard UI
Browse files- app.py +101 -0
- community_results.json +3 -0
- data.json +31 -0
- requirements.txt +2 -0
app.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
def load_data(source):
    """Load the raw leaderboard records for the selected source.

    Args:
        source: ``"core"`` reads ``data.json``; ``"community"`` reads
            ``community_results.json``. Both paths are resolved relative
            to the current working directory.

    Returns:
        The parsed JSON content of the selected file, or an empty list
        when ``source`` is neither of the two known values.

    Raises:
        FileNotFoundError: If the selected file does not exist.
        json.JSONDecodeError: If the selected file is not valid JSON.
    """
    if source == "core":
        path = "data.json"
    elif source == "community":
        path = "community_results.json"
    else:
        return []

    # JSON is UTF-8 by specification; do not depend on the platform locale
    # (non-ASCII benchmark names would be garbled under e.g. cp1252).
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)
|
| 15 |
+
|
| 16 |
+
def build_table(source):
    """Build the leaderboard table for the selected source.

    Args:
        source: Source selector, passed through to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with one row per loaded record and the
        columns "Benchmark", "Category", "Pile Dirty (%)",
        "DCLM Dirty (%)" and "CC202505 Dirty (%)", sorted by
        "Pile Dirty (%)" in descending order. Fields missing from a record
        are filled with an empty string. A source with no records yields an
        empty frame that still carries all five columns.
    """
    # Display column -> key in the JSON record. The labels match the header
    # list the leaderboard UI passes to its Dataframe component.
    column_keys = {
        "Benchmark": "Benchmark",
        "Category": "Category",
        "Pile Dirty (%)": "Pile Dirty",
        "DCLM Dirty (%)": "DCLM Dirty",
        "CC202505 Dirty (%)": "CC202505 Dirty",
    }

    rows = [
        {column: entry.get(key, "") for column, key in column_keys.items()}
        for entry in load_data(source)
    ]

    # Naming the columns explicitly keeps them present when there are no
    # rows; without this, sorting an empty source raises KeyError.
    table = pd.DataFrame(rows, columns=list(column_keys))
    if table.empty:
        return table

    # Sort on a numeric view of the column so a record with a missing value
    # (stored as "") cannot trigger a str/float comparison error; such rows
    # sort last.
    return table.sort_values(
        by="Pile Dirty (%)",
        ascending=False,
        key=lambda values: pd.to_numeric(values, errors="coerce"),
    )
|
| 30 |
+
|
| 31 |
+
def record_submission(jsonl_file, hf_path, field_name):
    """Append a contamination-check request to the pending-submissions queue.

    The queue is the JSON list stored in ``pending_submissions.json`` in the
    current working directory; it is created on first use.

    Args:
        jsonl_file: The uploaded file, or ``None`` when nothing was uploaded.
            Either an object exposing its path as ``.name`` or a plain path
            string is accepted.
        hf_path: HuggingFace dataset path entered by the user; may be empty.
            Surrounding whitespace is ignored. Takes precedence over
            ``jsonl_file`` when both are given.
        field_name: Name of the dataset field to check; stored as given.

    Returns:
        A status message: a prompt to provide input when neither a file nor
        a dataset path was supplied, otherwise a confirmation.

    Raises:
        json.JSONDecodeError: If the existing queue file holds invalid JSON
            (it is left untouched in that case).
    """
    hf_path = (hf_path or "").strip()
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    if hf_path:
        source, kind = hf_path, "hf"
    else:
        # NOTE(review): depending on the gradio version/configuration the
        # upload may arrive as a file-like object or as a path string —
        # both are handled here; confirm against the deployed version.
        source, kind = getattr(jsonl_file, "name", jsonl_file), "jsonl"

    entry = {
        "source": source,
        "type": kind,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    try:
        with open(queue_file, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        raw = ""

    # A missing or zero-length queue file simply means "no submissions yet".
    existing = json.loads(raw) if raw.strip() else []
    existing.append(entry)

    # Write to a sibling file and swap it in, so a failure mid-write cannot
    # leave the queue truncated.
    tmp_file = queue_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2)
    os.replace(tmp_file, queue_file)

    return "✅ Submission received! You'll be notified when processing is complete."
|
| 51 |
+
|
| 52 |
+
with gr.Blocks() as interface:
    # NOTE(review): the leading "π" looks like a mis-decoded emoji; the
    # original glyph cannot be recovered from this text — confirm and restore.
    gr.Markdown("# π Benchmark Contamination Bulletin")

    with gr.Tabs():
        # Leaderboard Tab
        with gr.Tab(label="Leaderboard"):

            # Selector for which result set the table shows.
            source_radio = gr.Radio(
                label="Select Benchmark Source",
                choices=["core", "community"],
                value="core",
            )

            table_columns = [
                "Benchmark",
                "Category",
                "Pile Dirty (%)",
                "DCLM Dirty (%)",
                "CC202505 Dirty (%)",
            ]

            # Read-only table, initially populated from the "core" source.
            leaderboard_table = gr.Dataframe(
                label="Dirty Rates",
                headers=table_columns,
                value=build_table("core"),
                wrap=True,
                interactive=False,
            )

            def update_table(source):
                """Return the table for the newly selected source."""
                return build_table(source)

            # Refresh the table whenever the selected source changes.
            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table,
            )

        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            # The two alternative ways of pointing at a dataset, side by side.
            with gr.Row():
                jsonl_input = gr.File(
                    label="Upload .jsonl File",
                    file_types=[".jsonl"],
                )
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )

            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(
                label="Submission Status",
                interactive=False,
            )

            # Hand the form values to record_submission and show its message.
            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output,
            )

interface.launch()
|
community_results.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
|
| 3 |
+
]
|
data.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
| 3 |
+
{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": 6.87, "CC202505 Dirty": 6.87},
|
| 4 |
+
{"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.44, "DCLM Dirty": 0.44, "CC202505 Dirty": 0.44},
|
| 5 |
+
{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": 0.46, "CC202505 Dirty": 0.46},
|
| 6 |
+
{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
| 7 |
+
|
| 8 |
+
{"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
|
| 9 |
+
{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
|
| 10 |
+
{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
|
| 11 |
+
{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
|
| 12 |
+
|
| 13 |
+
{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
| 14 |
+
{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
| 15 |
+
{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
| 16 |
+
{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
|
| 17 |
+
{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
|
| 18 |
+
|
| 19 |
+
{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": 0},
|
| 20 |
+
{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": 0},
|
| 21 |
+
{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": 0},
|
| 22 |
+
{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
| 23 |
+
{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 0},
|
| 24 |
+
{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
|
| 25 |
+
{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 0.0, "CC202505 Dirty": 0},
|
| 26 |
+
{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
|
| 27 |
+
|
| 28 |
+
{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
| 29 |
+
{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": 0, "CC202505 Dirty": 0}
|
| 30 |
+
]
|
| 31 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
huggingface_hub==0.14.1
|
| 2 |
+
pandas
|