Commit cb51391 · Parent(s): e3793a3

maint: improve text

Files changed:
- README.md (+6 -14)
- constants.py (+2 -2)
- main.py (+60 -26)
README.md CHANGED
@@ -11,20 +11,12 @@ short_description: 'TabArena'
 sdk_version: 4.44.0
 ---

-# TabArena Leaderboard
+# TabArena Leaderboard Code

-This repository contains the frontend code to display TabArena leaderboard.
-HuggingFace space.
+This repository contains the frontend code to display TabArena leaderboard.
+The leaderboard is hosted on a HuggingFace space.

 Reference:
-*
-*
-*
-
-TODOS:
-* add regression/classif/multiclassif
-
-DONE:
-* readme title and information
-* pull data from leaderboard
-* update columns
+* Website: tabarena.ai
+* Paper: TBA
+* TabArena Codebase: https://tabarena.ai/code
constants.py CHANGED
@@ -4,9 +4,9 @@ class Constants:
     foundational: str = "Foundation Model"
     neural_network: str ="Neural Network"
     baseline: str = "Baseline"
+    reference: str ="Reference Pipeline"
     # Not Used
     other: str = "Other"
-    automl: str = "AutoML"

 model_type_emoji = {
     Constants.tree: "🌳",
@@ -15,5 +15,5 @@ model_type_emoji = {
     Constants.baseline: "📏",
     # Not used
     Constants.other: "❓",
-    Constants.
+    Constants.reference:"📊",
 }
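Aside: reassembled, the constants after this commit would look roughly like the sketch below. Values the diff does not show (the `tree` label and the emojis for `foundational` and `neural_network`) are assumptions and marked as such; the last line shows how the mapping might be used to decorate a family label for display.

    # Sketch of constants.py after this commit; marked values are assumptions.
    class Constants:
        tree: str = "Tree-Based"  # assumed label; not visible in the diff
        foundational: str = "Foundation Model"
        neural_network: str = "Neural Network"
        baseline: str = "Baseline"
        reference: str = "Reference Pipeline"
        # Not Used
        other: str = "Other"

    model_type_emoji = {
        Constants.tree: "🌳",
        Constants.foundational: "🧠",    # assumed emoji; hunk context omits it
        Constants.neural_network: "🔮",  # assumed emoji; hunk context omits it
        Constants.baseline: "📏",
        # Not used
        Constants.other: "❓",
        Constants.reference: "📊",
    }

    # Example: decorate a model-family label with its emoji for display.
    print(f"{model_type_emoji[Constants.reference]} {Constants.reference}")  # 📊 Reference Pipeline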
main.py CHANGED
@@ -8,40 +8,73 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from constants import Constants, model_type_emoji
 from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

-TITLE = """<h1 align="center" id="space-title">TabArena
-
-INTRODUCTION_TEXT = (
-    "TabArena Leaderboard measures the performance of tabular models on a collection of tabular "
-    "datasets manually curated. The datasets are collected to make sure they are tabular, with "
-    "permissive license without ethical issues and so on, we refer to the paper for a full "
-    "description of our approach."
-)
+TITLE = """<h1 align="center" id="space-title">TabArena Leaderboard for Predictive Machine Learning on IID Tabular Data</h1>"""
+
+INTRODUCTION_TEXT = """
+TabArena is a living benchmark system for predictive machine learning on tabular data.
+The goal of TabArena and its leaderboard is to asses the peak performance of
+model-specific pipelines.
+
+**Datasets:** Currently, the leaderboard is based on a manually curated collection of
+51 tabular classification and regression datasets for independent and identically distributed
+(IID) data, spanning the small to medium data regime. The datasets were carefully
+curated to represent various real-world predictive machine learning use cases.
+
+**Models:** The focus of the leaderboard is on model-specific pipelines. Each pipeline
+is evaluated with default or tuned hyperparameter configuration or as an ensemble of
+tuned configurations. Each model is implemented in a tested real-world pipeline that was
+optimized to get the most out of the model by the maintainers of TabArena, and where
+possible together with the authors of the model.
+
+**Reference Pipeline:** The leaderboard includes a reference pipeline, which is applied
+independently of the tuning protocol and constraints we constructed for models within TabArena.
+The reference pipeline aims to represent the performance quickly achievable by a
+practitioner on a dataset. The current reference pipeline is the predictive machine
+learning system AutoGluon (version 1.3, with the best_quality preset and
+4 hours for training). AutoGluon represents an ensemble pipeline across various model
+types and thus provides a reference for model-specific pipelines.
+
+The current leaderboard is based on TabArena-v0.1.
+"""

 ABOUT_TEXT = """
-##
-
-
-
-
-
-
-
-
+## Using TabArena for Benchmarking
+To compare your own methods to the pre-computed results for all models on the leaderboard,
+you can use the TabArena framework. For examples on how to use TabArena for benchmarking,
+please see https://github.com/TabArena/tabarena_benchmarking_examples
+
+## Contributing Data
+For anything related to the datasets used in TabArena, please see https://github.com/TabArena/tabarena_dataset_curation
+
+## Contributing to the Leaderboard
+For guidelines on how to contribute the result of your model to the official leaderboard,
+please see the appendix of our paper. <TODO: publish documentation>
+
+## Contact The Maintainers
+For any inquires related to TabArena, please reach out to: [email protected]
+
+## Core Maintainers
+The current core maintainers of TabArena are:
+[Nick Erickson](https://github.com/Innixma),
+[Lennart Purucker](https://github.com/LennartPurucker/),
+[Andrej Tschalzev](https://github.com/atschalz),
+[David Holzmüller](https://github.com/dholzmueller)
 """

 CITATION_BUTTON_LABEL = (
-    "If you use
+    "If you use TabArena or the leaderboard in your research please cite the following:"
 )
 CITATION_BUTTON_TEXT = r"""
 @article{
 TBA,
 }
 """


 def get_model_family(model_name: str) -> str:
     prefixes_mapping = {
-        Constants.
+        Constants.reference: ["AutoGluon"],
         Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
         Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
         Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
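Aside: the hunk above ends inside `prefixes_mapping`, so the matching logic of `get_model_family` is not visible in this commit. A minimal sketch of how such a prefix lookup could work; the `startswith` loop and the fallback are assumptions, not lines from the repo:

    from constants import Constants  # as imported in main.py

    def get_model_family_sketch(model_name: str) -> str:
        # Assumed shape: map each family constant to known method-name prefixes.
        prefixes_mapping = {
            Constants.reference: ["AutoGluon"],
            Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
            Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
            Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
        }
        # Return the first family whose prefix matches, case-insensitively.
        for family, prefixes in prefixes_mapping.items():
            if any(model_name.upper().startswith(p.upper()) for p in prefixes):
                return family
        return Constants.other  # fall back to the "Other" bucket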
@@ -88,7 +121,6 @@ def load_data(filename: str):
         f"Loaded dataframe with {len(df_leaderboard)} rows and columns {df_leaderboard.columns}"
     )

-
     # add model family information

     df_leaderboard["Type"] = df_leaderboard.loc[:, "method"].apply(
@@ -123,7 +155,9 @@ def load_data(filename: str):
     ]

     # round for better display
-    df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(0)
+    df_leaderboard[["elo", "Elo 95% CI"]] = df_leaderboard[["elo", "Elo 95% CI"]].round(
+        0
+    )
    df_leaderboard[["median_time_train_s_per_1K", "rank"]] = df_leaderboard[
         ["median_time_train_s_per_1K", "rank"]
     ].round(2)
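For illustration, the display rounding in the hunk above applied to a toy frame; the numbers are invented:

    import pandas as pd

    # Toy illustration of the display rounding; values are made up.
    df = pd.DataFrame({
        "elo": [1523.67],
        "Elo 95% CI": [12.34],
        "median_time_train_s_per_1K": [3.14159],
        "rank": [2.456],
    })
    df[["elo", "Elo 95% CI"]] = df[["elo", "Elo 95% CI"]].round(0)  # whole Elo points
    df[["median_time_train_s_per_1K", "rank"]] = df[
        ["median_time_train_s_per_1K", "rank"]
    ].round(2)  # two decimals for times and ranks
    print(df.iloc[0].to_dict())
    # {'elo': 1524.0, 'Elo 95% CI': 12.0, 'median_time_train_s_per_1K': 3.14, 'rank': 2.46}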
@@ -139,7 +173,7 @@ def load_data(filename: str):
     return df_leaderboard.rename(
         columns={
             "median_time_train_s_per_1K": "Median Train Time (s/1K) [⬇️]",
-            "median_time_infer_s_per_1K": "Median Predict Time (s/1K)
+            "median_time_infer_s_per_1K": "Median Predict Time (s/1K) [⬇️]",
             "method": "Model",
             "elo": "Elo [⬆️]",
             "rank": "Rank [⬇️]",
@@ -213,9 +247,9 @@ def make_leaderboard(df_leaderboard: pd.DataFrame) -> Leaderboard:
         label="(Not) Imputed Models.",
         info="We impute the performance for models that cannot run on all"
         " datasets due to task or dataset size constraints (e.g. TabPFN,"
-        " TabICL). We impute with the performance of a
-        " We add a postfix [X% IMPUTED] to the model if any results were"
-        "imputed. The X% shows the percentage of"
+        " TabICL). We impute with the performance of a default RandomForest."
+        " We add a postfix [X% IMPUTED] to the model if any results were"
+        " imputed. The X% shows the percentage of"
         " datasets that were imputed. In general, imputation negatively"
         " represents the model performance, punishing the model for not"
         " being able to run on all datasets.",
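Aside: the info text above describes the "[X% IMPUTED]" postfix; a hypothetical helper illustrating the arithmetic (not code from this commit, and the function name is invented):

    def imputed_postfix(model: str, n_imputed: int, n_datasets: int) -> str:
        """Append '[X% IMPUTED]' when any of a model's results were imputed."""
        if n_imputed == 0:
            return model
        pct = round(100 * n_imputed / n_datasets)
        return f"{model} [{pct}% IMPUTED]"

    # e.g. a model imputed on 13 of the 51 datasets:
    print(imputed_postfix("TabPFN (tuned)", 13, 51))  # -> TabPFN (tuned) [25% IMPUTED]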
@@ -232,7 +266,7 @@ def main():
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

         with gr.Tabs(elem_classes="tab-buttons"):
-            with gr.TabItem("🏅
+            with gr.TabItem("🏅 TabArena-v0.1", elem_id="llm-benchmark-tab-table", id=2):
                 df_leaderboard = load_data("tabarena_leaderboard")
                 make_leaderboard(df_leaderboard)

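For orientation, a minimal sketch of how the hunks above fit together in main(); the gr.Blocks container and the launch call are assumptions inferred from the imports and indentation, not lines shown in this commit:

    import gradio as gr

    def main():
        with gr.Blocks() as demo:  # assumed container; not visible in the diff
            gr.Markdown(TITLE)
            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons"):
                with gr.TabItem("🏅 TabArena-v0.1", elem_id="llm-benchmark-tab-table", id=2):
                    df_leaderboard = load_data("tabarena_leaderboard")
                    make_leaderboard(df_leaderboard)
        demo.launch()  # assumed entry point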