Hao Xu committed
Commit · 7357a15
1 Parent(s): 30c3967
data update
app.py	CHANGED

@@ -188,12 +188,12 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
         'field_name': field_name.strip(),
         'hf_path': hf_path.strip() if has_hf else ''
     }
-
+    print(json.dumps(data))
     files = {}
     if has_jsonl:
         files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
 
-    response = requests.post(URL
+    response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)
 
     if files:
         files['file'][1].close()

@@ -204,13 +204,13 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
             message = result.get('message', 'Submission successful!')
 
             full_message = f"{message}\n\n" \
-                           f"
-                           f"
-                           f"
+                           f"✅ Your submission has been saved and will be processed automatically.\n" \
+                           f"Results will appear in the main leaderboard when ready.\n" \
+                           f"You can refresh the leaderboard to check for updates."
 
             return full_message
         elif result.get("status") == "info":
-            return f"
+            return f"ℹ️ {result.get('message', 'Submission already exists')}"
         else:
            return f"❌ {result.get('message', 'Unknown error occurred')}"
     else:

@@ -258,9 +258,9 @@ with gr.Blocks() as interface:
 
     with gr.Row():
         jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
-
-
-
+        with gr.Column():
+            hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
+            hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
     submit_button = gr.Button("Submit for Contamination Check")
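The updated request now posts the form fields and the optional .jsonl upload to the service's /submit route with a 30-second timeout, and the new print(json.dumps(data)) line logs the payload before sending. Below is a minimal sketch of the same flow using a context manager so the upload handle is closed even if the request fails; URL, data, benchmark_name, and jsonl_file come from the diff above, while the post_submission wrapper itself is a hypothetical helper added only for illustration.

import json
import requests

def post_submission(url, data, benchmark_name, jsonl_path=None):
    # Hypothetical helper sketching the request block from the diff above.
    print(json.dumps(data))  # payload logging added in this commit

    if jsonl_path:
        # The context manager closes the upload even if requests.post raises,
        # replacing the manual files['file'][1].close() bookkeeping.
        with open(jsonl_path, 'rb') as fh:
            files = {'file': (benchmark_name.strip() + '.jsonl', fh, 'application/json')}
            return requests.post(f"{url}/submit", data=data, files=files, timeout=30)

    # HuggingFace-path submissions carry no file upload.
    return requests.post(f"{url}/submit", data=data, timeout=30)

A caller would pass jsonl_file.name from the Gradio gr.File component as jsonl_path whenever a file was uploaded.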
data.json	CHANGED

@@ -1,31 +1,31 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.
-{"Benchmark": "MMLU-
-{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-
-{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.
-
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.
-
-{"Benchmark": "ARC-
-{"Benchmark": "ARC-
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},
+
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
+
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
+
+{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
 ]
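The rewritten data.json adds "DCLM Dirty", "CC202505 Dirty", and "URL" fields to every benchmark row. Below is a minimal sketch of reading this file into a table, for example to back the leaderboard view; only the column names come from the file above, while the pandas dependency, file path, and sort order are illustrative assumptions.

import json
import pandas as pd

# Load the contamination table committed in data.json (the blank lines between
# categories are ordinary JSON whitespace, so json.load handles them directly).
with open("data.json", "r", encoding="utf-8") as fh:
    rows = json.load(fh)

df = pd.DataFrame(rows)

# Sort each category by Pile contamination, highest first (an illustrative choice;
# the app may order rows differently).
df = df.sort_values(["Category", "Pile Dirty"], ascending=[True, False])
print(df[["Benchmark", "Category", "Pile Dirty", "DCLM Dirty", "CC202505 Dirty"]].to_string(index=False))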