import gradio as gr
import os
import json
import re
from huggingface_hub import HfApi
from datasets import load_dataset
import requests
import datetime
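
# Configuration is read from environment variables (e.g. Space secrets/variables):
#   HF_TOKEN - Hugging Face access token
#   OWNER    - namespace that hosts the results datasets
#   URL      - endpoint of the server that processes new benchmark submissions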
TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
RESULTS_CORE = f"{OWNER}/core_benchmark_results"
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
api = HfApi()
URL = os.environ.get("URL")


def load_data(source, refresh=False):
    """Load contamination results from the core or community results dataset."""
    repo = RESULTS_CORE if source == "core" else RESULTS_COMMUNITY
    if refresh:
        # Bypass the local cache so freshly added results are picked up.
        ds = load_dataset(repo, download_mode="force_redownload")
    else:
        ds = load_dataset(repo)
    return list(ds["train"])
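

# Each results entry is a flat dict. The fields read by build_table() look roughly like
#   {"Benchmark": ..., "URL": ..., "Category" (core) or "Contributor" (community): ...,
#    "Pile Dirty": ..., "DCLM Dirty": ..., "CC<year><week> Dirty": ...}
# (key names inferred from the lookups below; values are contamination rates as floats).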
def build_table(source, refresh=False):
    """Render the contamination bulletin as a sortable HTML table."""
    data = load_data(source, refresh)
    if source == "core":
        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
    else:
        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]

    # Collect Common Crawl dump columns (keys like "CC202450 Dirty") from all entries
    # and append them in chronological order as "CC-<year>-<week> Dirty (%)".
    cc_columns = set()
    for entry in data:
        for key in entry.keys():
            match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
            if match:
                year, crawl = match.groups()
                formatted_key = f"CC-{year}-{crawl} Dirty (%)"
                cc_columns.add((year, crawl, formatted_key))
    for year, crawl, formatted_key in sorted(cc_columns):
        headers.append(formatted_key)

| html = """ | |
| <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;"> | |
| <thead> | |
| <tr> | |
| """ | |
| for col in headers: | |
| html += f''' | |
| <th onclick="sortTable(this)" style="cursor: pointer; border: 1px solid #ddd; padding: 8px; text-align: right;"> | |
| {col} | |
| <span class="tri-container"> | |
| <span class="triangle-up"></span> | |
| <span class="triangle-down"></span> | |
| </span> | |
| </th> | |
| ''' | |
| html += "</tr></thead><tbody>" | |
    # One row per benchmark entry, with cells in the same order as `headers`.
    for entry in data:
        name = entry.get("Benchmark", "")
        url = entry.get("URL", "")
        # Link the benchmark name to its homepage when a URL is available.
        hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name
        row = {
            "Benchmark": hyperlink,
            # Negative placeholder values are rendered as "N/A" below.
            "Pile-train Dirty (%)": entry.get("Pile Dirty", -1.0),
            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1.0)
        }
        for key, value in entry.items():
            match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
            if match:
                year, crawl = match.groups()
                formatted_key = f"CC-{year}-{crawl} Dirty (%)"
                row[formatted_key] = value
        if source == "core":
            row["Category"] = entry.get("Category", "")
        elif source == "community":
            row["Contributor"] = entry.get("Contributor", "")

        html += "<tr>"
        for col in headers:
            val = row.get(col, "")
            if isinstance(val, float) and val >= 0:
                val_display = f"{val:5.1f}"
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val_display}</td>'
            elif isinstance(val, float):
                html += '<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">N/A</td>'
            else:
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
        html += "</tr>\n"
    html += "</tbody></table>"
| html += """ | |
| <script> | |
| let sortDirection = {}; | |
| function sortTable(header) { | |
| const table = document.getElementById("benchmarkTable"); | |
| const rows = Array.from(table.tBodies[0].rows); | |
| const columnIndex = Array.from(header.parentNode.children).indexOf(header); | |
| const isAscending = sortDirection[columnIndex] === 'ascending'; | |
| sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending'; | |
| Array.from(header.parentNode.children).forEach(th => { | |
| const up = th.querySelector('.triangle-up'); | |
| const down = th.querySelector('.triangle-down'); | |
| if (up) up.classList.remove('active'); | |
| if (down) down.classList.remove('active'); | |
| }); | |
| if (sortDirection[columnIndex] === 'ascending') { | |
| header.querySelector('.triangle-up').classList.add('active'); | |
| } else { | |
| header.querySelector('.triangle-down').classList.add('active'); | |
| } | |
| rows.sort((rowA, rowB) => { | |
| const cellA = rowA.cells[columnIndex].innerText; | |
| const cellB = rowB.cells[columnIndex].innerText; | |
| if (isNaN(cellA)) { | |
| return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA); | |
| } | |
| return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA); | |
| }); | |
| rows.forEach(row => table.tBodies[0].appendChild(row)); | |
| } | |
| </script> | |
| """ | |
| html += """ | |
| <style> | |
| thead tr { | |
| background-color: #f0f0f0; | |
| } | |
| .tri-container { | |
| display: inline-block; | |
| margin-left: 4px; | |
| vertical-align: middle; | |
| } | |
| .triangle-up, .triangle-down { | |
| display: block; | |
| width: 0; | |
| height: 0; | |
| margin: 1px auto; | |
| border-left: 5px solid transparent; | |
| border-right: 5px solid transparent; | |
| } | |
| .triangle-up { | |
| border-bottom: 5px solid #999; | |
| } | |
| .triangle-down { | |
| border-top: 5px solid #999; | |
| } | |
| .triangle-up.active { | |
| border-bottom: 5px solid #000; | |
| } | |
| .triangle-down.active { | |
| border-top: 5px solid #000; | |
| } | |
| </style> | |
| """ | |
| return html | |
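

# Community submissions: after lightweight validation (account age, required fields, and a
# peek at the data to confirm the chosen field exists), the submission is forwarded to the
# processing server at URL, which queues the benchmark for contamination analysis.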
def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split, field_name, hf_config, profile: gr.OAuthProfile):
    """Validate a community submission and forward it to the processing server."""
    # The gr.OAuthProfile argument is injected by Gradio for users signed in via gr.LoginButton.
    # Reject very new accounts (younger than 10 days) to limit spam submissions.
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
        return "❌ This account is not authorized to submit."

    if not benchmark_name or not benchmark_name.strip():
        return "❌ Please provide a benchmark name."
    if not field_name or not field_name.strip():
        return "❌ Please provide a field name."

    has_jsonl = jsonl_file is not None
    has_hf = hf_path and hf_path.strip()
    if not has_jsonl and not has_hf:
        return "❌ Please provide either a .jsonl file or a HuggingFace dataset path."

    if has_jsonl:
        # Inspect the first few lines: each must be valid JSON and contain the requested field.
        try:
            with open(jsonl_file.name, 'r', encoding='utf-8') as f:
                line_count = 0
                for line in f:
                    line_count += 1
                    if line_count > 5:
                        break
                    try:
                        entry = json.loads(line.strip())
                        if field_name.strip() not in entry:
                            available_fields = list(entry.keys())
                            return f"❌ Field '{field_name.strip()}' not found in JSONL file. Available fields: {', '.join(available_fields)}"
                    except json.JSONDecodeError as e:
                        return f"❌ Invalid JSON format in line {line_count}: {str(e)}"
                if line_count == 0:
                    return "❌ The uploaded file is empty."
        except Exception as e:
            return f"❌ Error reading file: {str(e)}"
    elif has_hf:
        if not hf_split or not hf_split.strip():
            return "❌ Please provide a dataset split for the HuggingFace dataset."
        # Stream a single record to confirm the dataset is reachable and has the requested field.
        try:
            if hf_config:
                dataset_info = load_dataset(hf_path.strip(), hf_config.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            else:
                dataset_info = load_dataset(hf_path.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            first_item = next(iter(dataset_info))
            if field_name.strip() not in first_item:
                available_fields = list(first_item.keys())
                return f"❌ Field '{field_name.strip()}' not found in dataset. Available fields: {', '.join(available_fields)}"
        except Exception as e:
            return f"❌ Could not access HuggingFace dataset: {str(e)}"

    # Build the metadata payload and POST it (plus the optional .jsonl upload) to the server.
    try:
        data = {
            'name': benchmark_name.strip(),
            'contributor': contributor.strip(),
            'type': 'jsonl' if has_jsonl else 'hf',
            'split': hf_split.strip() if has_hf else '',
            'field_name': field_name.strip(),
            'hf_path': hf_path.strip() if has_hf else '',
            'hf_config': hf_config.strip() if has_hf else ''
        }
        print(json.dumps(data))
        files = {}
        if has_jsonl:
            files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
        response = requests.post(f"{URL}/", data={"payload": json.dumps(data)}, files=files, timeout=30)
        if files:
            files['file'][1].close()
        if response.status_code == 200:
            result = response.json()
            if result.get("status") == "success":
                return result.get('message', 'Submission successful!')
            elif result.get("status") == "info":
                return f"ℹ️ {result.get('message', 'Submission already exists')}"
            else:
                return f"❌ {result.get('message', 'Unknown error occurred')}"
        else:
            return f"❌ Server error: {response.status_code} - {response.text}"
    except Exception as e:
        return f"❌ Error submitting benchmark: {str(e)}"


with gr.Blocks() as interface:
    gr.HTML(
        '''<h1 style="text-align: center;">Benchmark Contamination Monitoring System</h1>
        <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora.</p>
        <p style='font-size: 16px;'>The system is released along with our paper <a href="https://arxiv.org/abs/2506.12229">Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index</a>, which documents the methodology and findings in detail.</p>
        <p style='font-size: 16px;'>We welcome the community to submit new benchmarks for contamination analysis using the <b>"Add New Benchmarks"</b> tab.</p>
        '''
    )

    with gr.Tabs():
        with gr.Tab(label="Bulletin"):
            gr.Markdown("## Benchmark Contamination Bulletin")
            with gr.Accordion(label='Click to view instructions', open=False):
                gr.Markdown('''
                The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
                - Benchmarks analyzed in our paper are listed under the **core** source. Community-submitted benchmarks appear under the **community** source.
                - The contamination rate is the percentage of *dirty* benchmark entries.
                - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
                ''')
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )
            leaderboard_html = gr.HTML(build_table("core", refresh=False))

            def update_table(source):
                # Force a fresh download so newly processed results show up.
                return build_table(source, refresh=True)

            # Switching the source rebuilds the table from the cached dataset;
            # the Refresh button re-downloads it via update_table().
            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )
            refresh_button = gr.Button("Refresh")
            refresh_button.click(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )
| with gr.Tab(label="Add New Benchmarks"): | |
| gr.Markdown(''' | |
| ## Add Your Own Benchmarks for Contamination Checking | |
| You can use this form to submit a benchmark for contamination checking. Submissions may include either a direct upload or a reference to a publicly available dataset on Hugging Face. | |
| ### Submission Guidelines: | |
| - **Benchmark Name**: Provide a name for your benchmark. | |
| - **Contributor**: Enter your name or affiliation. | |
| - **Data Source**: | |
| - Upload a `.jsonl` file containing your benchmark entries, or | |
| - Specify a Hugging Face dataset path (`author/benchmark-name`) along with the appropriate split (e.g., `test`, `validation`). | |
| - **Field Name**: Indicate the field to analyze for contamination: | |
| - For question-answering datasets: use the question field. | |
| - For language understanding tasks: use the context or passage field. | |
| ### What Happens Next: | |
| Once submitted, your benchmark will be queued for analysis. Results will be published in the **community** section of the bulletin. | |
| Processing time may vary depending on the dataset format and size. You can check the results by navigating to the **Bulletin** tab and selecting the **community** source, then clicking **Refresh**. | |
| ''') | |
| with gr.Row(): | |
| benchmark_name_input = gr.Textbox(label="Benchmark Name") | |
| contributor_input = gr.Textbox(label="Contributor") | |
| with gr.Row(): | |
| jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"]) | |
| with gr.Column(): | |
| hf_path_input = gr.Textbox(label="HuggingFace Dataset Path", placeholder="e.g., author/benchmark-name") | |
| hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test") | |
| hf_config_input = gr.Textbox(label="Dataset Config (optional)", placeholder="name of dataset config") | |
| field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...") | |
| with gr.Row(): | |
| gr.LoginButton() | |
| submit_button = gr.Button("Submit for Contamination Check") | |
| result_output = gr.Textbox(label="Submission Status", interactive=False) | |
| submit_button.click( | |
| fn=record_submission, | |
| inputs=[benchmark_name_input, contributor_input, jsonl_input, hf_path_input, hf_split_input, field_name_input, hf_config_input], | |
| outputs=result_output, | |
| ) | |
| interface.launch() | |