Spaces:

infini-gram-mini
/

Benchmark-Contamination-Monitoring-System

Running

App Files Community

Hao Xu committed on Jun 7

Commit

d56bc01

1 Parent(s): 490d0b1

UI update

Browse files

Files changed (2) hide show

app.py +58 -38
data.json +0 -1

app.py CHANGED Viewed

@@ -31,7 +31,6 @@ def load_data(source, refresh=False):
 def build_table(source, refresh=False):
     data = load_data(source, refresh)
-    entries = []
     if source == "core":
         headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
@@ -40,19 +39,25 @@ def build_table(source, refresh=False):
     html = """
     <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
-    <thead><tr>
     """
     for col in headers:
-        html += f'<th style="border: 1px solid #ddd; padding: 8px; text-align: right;" onclick="sortTable(this)">{col} <span class="triangle"></span></th>'
-    html += '</tr></thead>\n<tbody>\n'
     for entry in data:
         name = entry.get("Benchmark", "")
         url = entry.get("URL", "#")
-        if url:
-            hyperlink = f'<a href="{url}" target="_blank">{name}</a>'
-        else:
-            hyperlink = name
         row = {
             "Benchmark": hyperlink,
@@ -70,10 +75,12 @@ def build_table(source, refresh=False):
         for col in headers:
             val = row.get(col, "")
             if isinstance(val, float) and val >= 0:
-                val = f"{val:5.1f}"
             elif isinstance(val, float):
-                val = "N/A"
-            html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
         html += "</tr>\n"
     html += "</tbody></table>"
@@ -83,52 +90,67 @@ def build_table(source, refresh=False):
         let sortDirection = {};
         function sortTable(header) {
-            var table = document.getElementById("benchmarkTable");
-            var rows = Array.from(table.rows).slice(1);
-            var columnIndex = Array.from(header.parentNode.children).indexOf(header);
-            var isAscending = sortDirection[columnIndex] === 'ascending';
             sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending';
-            var allHeaders = header.parentNode.children;
-            Array.from(allHeaders).forEach(th => {
-                th.querySelector('.triangle').classList.remove('ascending', 'descending');
             });
-            header.querySelector('.triangle').classList.add(sortDirection[columnIndex]);
-            rows.sort(function(rowA, rowB) {
-                var cellA = rowA.cells[columnIndex].innerText;
-                var cellB = rowB.cells[columnIndex].innerText;
                 if (isNaN(cellA)) {
                     return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA);
                 }
                 return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA);
             });
-            for (var i = 0; i < rows.length; i++) {
-                table.appendChild(rows[i]);
-            }
         }
     </script>
     """
     html += """
     <style>
-        .triangle {
             display: inline-block;
             width: 0;
             height: 0;
             border-left: 5px solid transparent;
             border-right: 5px solid transparent;
-            margin-left: 5px;
-            transition: transform 0.2s;
         }
-        .ascending {
             border-bottom: 5px solid #000;
         }
-        .descending {
             border-top: 5px solid #000;
         }
     </style>
@@ -231,9 +253,9 @@ with gr.Blocks() as interface:
     gr.HTML(
             '''<h1 text-align="center">📖 Benchmark Contamination Monitoring System</h1>
-            <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora.</p>
             <p style='font-size: 16px;'>The system is released along with our paper Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index, which documents the methodology and findings in detail.</p>
-            <p style='font-size: 16px;'>We invite the community to contribute by submitting new benchmarks for contamination analysis using the form available in the <b>"Add New Benchmarks"</b> tab.</p>
             '''
         )
@@ -244,11 +266,9 @@ with gr.Blocks() as interface:
                 gr.Markdown('''
                 The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
-                - Benchmarks analyzed in our accompanying paper are listed under the **core** source.
-                - User-submitted benchmarks appear under the **community** source.
-                - The contamination rate represents the percentage of benchmark entries identified as *dirty* based on our detection criteria.
                 - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
-                - You can sort the results by clicking on the column headers.
                 ''')
             source_radio = gr.Radio(

 def build_table(source, refresh=False):
     data = load_data(source, refresh)
     if source == "core":
         headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
     html = """
     <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
+    <thead>
+    <tr>
     """
     for col in headers:
+        html += f'''
+        <th onclick="sortTable(this)" style="cursor: pointer; border: 1px solid #ddd; padding: 8px; text-align: right;">
+            {col}
+            <span class="tri-container">
+                <span class="triangle-up"></span>
+                <span class="triangle-down"></span>
+            </span>
+        </th>
+        '''
+    html += "</tr></thead><tbody>"
     for entry in data:
         name = entry.get("Benchmark", "")
         url = entry.get("URL", "#")
+        hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name
         row = {
             "Benchmark": hyperlink,
         for col in headers:
             val = row.get(col, "")
             if isinstance(val, float) and val >= 0:
+                val_display = f"{val:5.1f}"
+                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val_display}</td>'
             elif isinstance(val, float):
+                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">N/A</td>'
+            else:
+                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
         html += "</tr>\n"
     html += "</tbody></table>"
         let sortDirection = {};
         function sortTable(header) {
+            const table = document.getElementById("benchmarkTable");
+            const rows = Array.from(table.tBodies[0].rows);
+            const columnIndex = Array.from(header.parentNode.children).indexOf(header);
+            const isAscending = sortDirection[columnIndex] === 'ascending';
             sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending';
+            Array.from(header.parentNode.children).forEach(th => {
+                const up = th.querySelector('.triangle-up');
+                const down = th.querySelector('.triangle-down');
+                if (up) up.classList.remove('active');
+                if (down) down.classList.remove('active');
             });
+            if (sortDirection[columnIndex] === 'ascending') {
+                header.querySelector('.triangle-up').classList.add('active');
+            } else {
+                header.querySelector('.triangle-down').classList.add('active');
+            }
+            rows.sort((rowA, rowB) => {
+                const cellA = rowA.cells[columnIndex].innerText;
+                const cellB = rowB.cells[columnIndex].innerText;
                 if (isNaN(cellA)) {
                     return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA);
                 }
                 return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA);
             });
+            rows.forEach(row => table.tBodies[0].appendChild(row));
         }
     </script>
     """
     html += """
     <style>
+        thead tr {
+            background-color: #f0f0f0;
+        }
+        .tri-container {
             display: inline-block;
+            margin-left: 4px;
+            vertical-align: middle;
+        }
+        .triangle-up, .triangle-down {
+            display: block;
             width: 0;
             height: 0;
+            margin: 1px auto;
             border-left: 5px solid transparent;
             border-right: 5px solid transparent;
         }
+        .triangle-up {
+            border-bottom: 5px solid #999;
+        }
+        .triangle-down {
+            border-top: 5px solid #999;
+        }
+        .triangle-up.active {
             border-bottom: 5px solid #000;
         }
+        .triangle-down.active {
             border-top: 5px solid #000;
         }
     </style>
     gr.HTML(
             '''<h1 text-align="center">📖 Benchmark Contamination Monitoring System</h1>
+            <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora 🧐.</p>
             <p style='font-size: 16px;'>The system is released along with our paper Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index, which documents the methodology and findings in detail.</p>
+            <p style='font-size: 16px;'>We welcome the community to submit new benchmarks for contamination analysis using the <b>"Add New Benchmarks"</b> tab.</p>
             '''
         )
                 gr.Markdown('''
                 The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
+                - Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
+                - The contamination rate represents the percentage of *dirty* benchmark entries.
                 - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
                 ''')
             source_radio = gr.Radio(

data.json CHANGED Viewed

@@ -22,7 +22,6 @@
   {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
   {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
   {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
-  {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
   {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
   {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},

   {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
   {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
   {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
   {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
   {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},