Hao Xu committed
Commit · 7357a15
1 Parent(s): 30c3967
data update
app.py	CHANGED

@@ -188,12 +188,12 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
         'field_name': field_name.strip(),
         'hf_path': hf_path.strip() if has_hf else ''
     }
-
+    print(json.dumps(data))
     files = {}
     if has_jsonl:
         files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
 
-    response = requests.post(URL
+    response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)
 
     if files:
         files['file'][1].close()

@@ -204,13 +204,13 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
             message = result.get('message', 'Submission successful!')
 
             full_message = f"{message}\n\n" \
-                           f"
-                           f"
-                           f"
+                           f"✅ Your submission has been saved and will be processed automatically.\n" \
+                           f"Results will appear in the main leaderboard when ready.\n" \
+                           f"You can refresh the leaderboard to check for updates."
 
             return full_message
         elif result.get("status") == "info":
-            return f"
+            return f"ℹ️ {result.get('message', 'Submission already exists')}"
         else:
            return f"❌ {result.get('message', 'Unknown error occurred')}"
     else:

@@ -258,9 +258,9 @@ with gr.Blocks() as interface:
 
     with gr.Row():
         jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
-
-
-
+        with gr.Column():
+            hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
+            hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
     submit_button = gr.Button("Submit for Contamination Check")
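The updated request now posts the form fields and the optional .jsonl upload to the service's /submit route with a 30-second timeout, and the new print(json.dumps(data)) line logs the payload before sending. Below is a minimal sketch of the same flow using a context manager so the upload handle is closed even if the request fails; URL, data, benchmark_name, and jsonl_file come from the diff above, while the post_submission wrapper itself is a hypothetical helper added only for illustration.

import json
import requests

def post_submission(url, data, benchmark_name, jsonl_path=None):
    # Hypothetical helper sketching the request block from the diff above.
    print(json.dumps(data))  # payload logging added in this commit

    if jsonl_path:
        # The context manager closes the upload even if requests.post raises,
        # replacing the manual files['file'][1].close() bookkeeping.
        with open(jsonl_path, 'rb') as fh:
            files = {'file': (benchmark_name.strip() + '.jsonl', fh, 'application/json')}
            return requests.post(f"{url}/submit", data=data, files=files, timeout=30)

    # HuggingFace-path submissions carry no file upload.
    return requests.post(f"{url}/submit", data=data, timeout=30)

A caller would pass jsonl_file.name from the Gradio gr.File component as jsonl_path whenever a file was uploaded.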
data.json	CHANGED

@@ -1,31 +1,31 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.
-{"Benchmark": "MMLU-
-{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-
-{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.
-
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.
-
-{"Benchmark": "ARC-
-{"Benchmark": "ARC-
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},
+
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
+
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
+
+{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
 ]
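The rewritten data.json adds "DCLM Dirty", "CC202505 Dirty", and "URL" fields to every benchmark row. Below is a minimal sketch of reading this file into a table, for example to back the leaderboard view; only the column names come from the file above, while the pandas dependency, file path, and sort order are illustrative assumptions.

import json
import pandas as pd

# Load the contamination table committed in data.json (the blank lines between
# categories are ordinary JSON whitespace, so json.load handles them directly).
with open("data.json", "r", encoding="utf-8") as fh:
    rows = json.load(fh)

df = pd.DataFrame(rows)

# Sort each category by Pile contamination, highest first (an illustrative choice;
# the app may order rows differently).
df = df.sort_values(["Category", "Pile Dirty"], ascending=[True, False])
print(df[["Benchmark", "Category", "Pile Dirty", "DCLM Dirty", "CC202505 Dirty"]].to_string(index=False))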