Spaces:

infini-gram-mini
/

Benchmark-Contamination-Monitoring-System

Running

App Files Files Community

Hao Xu committed on Apr 14

Commit

3c856c0

1 Parent(s): b58437b

leaderboard UI

Browse files

Files changed (4) hide show

app.py +101 -0
community_results.json +3 -0
data.json +31 -0
requirements.txt +2 -0

app.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import os
+import json
+import pandas as pd
def load_data(source):
    """Load the leaderboard rows for one benchmark source.

    Args:
        source: ``"core"`` reads ``data.json``; ``"community"`` reads
            ``community_results.json``. Paths are relative to the current
            working directory.

    Returns:
        The parsed JSON content of the matching file. An empty list is
        returned when ``source`` is not recognised or the file does not
        exist yet, so callers can always iterate the result.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    source_files = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = source_files.get(source)
    if path is None:
        return []
    try:
        # Explicit UTF-8: the platform default encoding is not guaranteed to
        # decode non-ASCII benchmark names correctly.
        with open(path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        # A source with no results file yet is treated like an empty source.
        return []
def build_table(source):
    """Build the leaderboard table for one benchmark source.

    Args:
        source: Source name passed through to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Benchmark``, ``Category``,
        ``Pile Dirty (%)``, ``DCLM Dirty (%)`` and ``CC202505 Dirty (%)``,
        sorted by ``Pile Dirty (%)`` in descending order. The columns are
        present even when the source has no rows.
    """
    # Display header -> key in the JSON rows.
    columns = {
        "Benchmark": "Benchmark",
        "Category": "Category",
        "Pile Dirty (%)": "Pile Dirty",
        "DCLM Dirty (%)": "DCLM Dirty",
        "CC202505 Dirty (%)": "CC202505 Dirty",
    }
    rows = [
        {header: entry.get(key, "") for header, key in columns.items()}
        for entry in load_data(source)
    ]
    # Passing the columns explicitly keeps the schema for an empty source;
    # without it the frame has no columns and sort_values raises KeyError.
    table = pd.DataFrame(rows, columns=list(columns))
    # Missing percentages become NaN instead of "", so the sort never has to
    # compare strings with floats.
    for header in ("Pile Dirty (%)", "DCLM Dirty (%)", "CC202505 Dirty (%)"):
        table[header] = pd.to_numeric(table[header], errors="coerce")
    return table.sort_values(by="Pile Dirty (%)", ascending=False)
def record_submission(jsonl_file, hf_path, field_name):
    """Append a contamination-check request to the pending-submissions queue.

    Args:
        jsonl_file: The uploaded ``.jsonl`` file, or ``None``. Either a path
            string or an object exposing the path as ``.name`` is accepted.
        hf_path: HuggingFace dataset path; takes precedence over the upload
            when non-blank. Surrounding whitespace is ignored.
        field_name: Name of the dataset field to check, stored as given.

    Returns:
        A status message for the UI: an error prompt when neither input is
        provided, otherwise a confirmation.
    """
    hf_path = (hf_path or "").strip()
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    if hf_path:
        source, kind = hf_path, "hf"
    else:
        # Depending on how the File component is configured the upload
        # arrives as a path string or as a file-like object with `.name`.
        source, kind = getattr(jsonl_file, "name", jsonl_file), "jsonl"
    entry = {
        "source": source,
        "type": kind,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    try:
        with open(queue_file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        loaded = []
    except json.JSONDecodeError:
        loaded = None

    if isinstance(loaded, list):
        existing = loaded
    else:
        # Keep the unreadable queue for inspection instead of crashing on
        # every later submission or silently overwriting it.
        os.replace(queue_file, queue_file + ".corrupt")
        existing = []

    existing.append(entry)
    # Write to a temporary file and swap it in so an interrupted write cannot
    # leave a truncated queue behind.
    tmp_file = queue_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2)
    os.replace(tmp_file, queue_file)
    return "✅ Submission received! You'll be notified when processing is complete."
# Two-tab Gradio app: a read-only leaderboard of benchmark dirty rates and a
# form that queues datasets for contamination checking.
with gr.Blocks() as interface:
    gr.Markdown("# 📖 Benchmark Contamination Bulletin")
    with gr.Tabs():
        # Leaderboard Tab
        with gr.Tab(label="Leaderboard"):
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )
            initial_table = build_table("core")
            # Take the headers from the table itself so they cannot drift
            # from the column names build_table actually produces.
            table_columns = list(initial_table.columns)
            leaderboard_table = gr.Dataframe(
                value=initial_table,
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates"
            )
            # Rebuild the table whenever a different source is selected.
            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_table
            )
        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
            field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)
            # The returned status message is shown in the read-only textbox.
            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output
            )
interface.launch()

community_results.json ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ [
2	+
3	+ ]

data.json ADDED Viewed

	@@ -0,0 +1,31 @@

+[
+    {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 0, "CC202505 Dirty": 0},
+    {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": 6.87, "CC202505 Dirty": 6.87},
+    {"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.44, "DCLM Dirty": 0.44, "CC202505 Dirty": 0.44},
+    {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": 0.46, "CC202505 Dirty": 0.46},
+    {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+    {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
+    {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
+    {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
+    {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
+    {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
+    {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": 0},
+    {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": 0},
+    {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": 0},
+    {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": 0, "CC202505 Dirty": 0},
+    {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 0},
+    {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
+    {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 0.0, "CC202505 Dirty": 0},
+    {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
+    {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": 0, "CC202505 Dirty": 0},
+    {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": 0, "CC202505 Dirty": 0}
+  ]

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ huggingface_hub==0.14.1
2	+ pandas