huckiyang committed
Commit 97984bb · Parent: 97dd638

[release] speechIQ

Files changed (3):
  1. SpeechIQ_table.csv +14 -0
  2. app.py +148 -169
  3. src/about.py +71 -47
SpeechIQ_table.csv ADDED
@@ -0,0 +1,14 @@
+ Model Type,Setup,Audio Encoder,Remember,Understand,Apply,Speech IQ
+ Agentic: ASR + LLM,Whisper_v2-1.5B + Qwen2_7B,Whisper_v2-1.5B,0.554,0.499,0.481,107.43
+ Agentic: ASR + LLM,Whisper_v3-1.5B + Qwen2_7B,Whisper_v2-1.5B,0.553,0.433,0.432,106.49
+ Agentic: ASR + LLM,Canary_1B + Qwen2_7B,Whisper_v2-1.5B,0.559,0.566,0.504,107.78
+ Agentic: ASR + LLM,OWSM-CTC_v3.1-1B + Qwen2_7B,OWSM-CTC_v3.1-1B,0.534,0.151,0.353,103.05
+ Agentic: ASR + GER + LLM,Whisper_v2-1.5B + GPT-4o + Qwen2_7B,Whisper_v2-1.5B,0.543,0.632,0.487,108.64
+ End2End,Qwen2-Audio_7B,1.5B Whisper,-0.187,0.366,0.011,103.88
+ End2End,Qwen2.5-Omni_7B,1.5B Whisper,0.472,0.41,0.509,105.74
+ End2End,Salmonn_13B,1.5B Whisper,0.508,0.381,-1.146,101.03
+ End2End,Desta2_8B,1.5B Whisper,-2.575,-1.604,-0.233,79.69
+ End2End,AnyGPT_7B,SpeechTokenizer,0.314,-2.718,-2.893,60.02
+ End2End,Baichuan-omni-1.5_7B,1.5B Whisper,0.448,0.184,0.546,104.02
+ End2End,Gemini-1.5-flash,Google_USM,-1.885,0.641,0.673,107.85
+ End2End,Gemini-1.5-pro,Google_USM,0.492,0.409,0.71,107.08
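
For quick reference, here is a minimal sketch of how the new results file can be inspected outside the Space; it assumes pandas is installed and that SpeechIQ_table.csv sits in the working directory (app.py below performs the same load-and-sort for the leaderboard tab):

```python
# Illustrative only, not part of the commit: sanity-check the new results file.
import pandas as pd

df = pd.read_csv("SpeechIQ_table.csv")

# Rank all 13 systems by the aggregate Speech IQ column.
print(df.sort_values("Speech IQ", ascending=False)[["Setup", "Speech IQ"]].to_string(index=False))

# Best system per cognitive dimension (Remember / Understand / Apply).
for dim in ["Remember", "Understand", "Apply"]:
    best = df.loc[df[dim].idxmax()]
    print(f"{dim}: {best['Setup']} ({best[dim]})")
```
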
app.py CHANGED
@@ -1,8 +1,6 @@
  import gradio as gr
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download

  from src.about import (
      CITATION_BUTTON_LABEL,
@@ -13,192 +11,173 @@ from src.about import (
      TITLE,
  )
  from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()

-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
- def init_leaderboard(dataframe):
-     if dataframe is None or dataframe.empty:
-         raise ValueError("Leaderboard DataFrame is empty or None.")
-     return Leaderboard(
-         value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
-         select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-             label="Select Columns to Display:",
-         ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-         filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
-         ],
-         bool_checkboxgroup_label="Hide models",
          interactive=False,
      )

- demo = gr.Blocks(css=custom_css)
  with demo:
      gr.HTML(TITLE)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-             leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-         with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
              with gr.Row():
-                 gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
-
              with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )

      with gr.Row():
          with gr.Accordion("πŸ“™ Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
-                 lines=20,
                  elem_id="citation-button",
                  show_copy_button=True,
              )

- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()

  import gradio as gr
  import pandas as pd
+ import numpy as np

  from src.about import (
      CITATION_BUTTON_LABEL,
      TITLE,
  )
  from src.display.css_html_js import custom_css

+ def load_speechiq_data():
+     """Load and process the SpeechIQ results from the CSV file."""
+     try:
+         df = pd.read_csv("SpeechIQ_table.csv")
+
+         # Round numerical columns to 3 decimal places for better display
+         numerical_cols = ['Remember', 'Understand', 'Apply', 'Speech IQ']
+         for col in numerical_cols:
+             if col in df.columns:
+                 df[col] = df[col].round(3)
+
+         # Sort by Speech IQ score in descending order
+         df = df.sort_values('Speech IQ', ascending=False)
+
+         return df
+     except Exception as e:
+         print(f"Error loading SpeechIQ data: {e}")
+         # Return an empty dataframe with the expected columns if the file is missing
+         return pd.DataFrame(columns=['Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ'])
+
+ def create_leaderboard_table(df):
+     """Create a formatted leaderboard table."""
+     if df.empty:
+         return gr.Dataframe(
+             value=df,
+             headers=['Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ'],
+             interactive=False
+         )
+
+     return gr.Dataframe(
+         value=df,
+         headers=df.columns.tolist(),
          interactive=False,
+         wrap=True,
+         column_widths=["15%", "25%", "15%", "11%", "11%", "11%", "12%"],
+         height=600
      )

+ def get_top_performers(df):
+     """Get statistics about top performers."""
+     if df.empty:
+         return "No data available."
+
+     top_score = df['Speech IQ'].max()
+     top_model = df.loc[df['Speech IQ'].idxmax()]
+
+     agentic_best = df[df['Model Type'].str.contains('Agentic', na=False)]['Speech IQ'].max() if not df[df['Model Type'].str.contains('Agentic', na=False)].empty else 0
+     end2end_best = df[df['Model Type'].str.contains('End2End', na=False)]['Speech IQ'].max() if not df[df['Model Type'].str.contains('End2End', na=False)].empty else 0
+
+     stats_text = f"""
+ ### πŸ“Š Leaderboard Statistics
+
+ **πŸ† Top Performer:** {top_model['Setup']} (Score: {top_score})
+
+ **πŸ€– Best Agentic Model:** {agentic_best}
+
+ **πŸ”„ Best End2End Model:** {end2end_best}
+
+ **πŸ“ˆ Total Models Evaluated:** {len(df)}
+ """
+
+     return stats_text
+
+ # Load the data
+ speechiq_df = load_speechiq_data()
+
+ # Create the Gradio interface
+ demo = gr.Blocks(css=custom_css, title="SpeechIQ Leaderboard")

  with demo:
      gr.HTML(TITLE)
      gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("πŸ… SpeechIQ Leaderboard", elem_id="speechiq-leaderboard-tab", id=0):
+
+             # Statistics section
              with gr.Row():
+                 stats_markdown = gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text")
+
+             # Main leaderboard table
              with gr.Row():
+                 leaderboard_table = create_leaderboard_table(speechiq_df)
+
+             # Legend and explanation
+             with gr.Row():
+                 gr.Markdown("""
+ ### πŸ“‹ Column Explanations
+
+ - **Model Type**: Architecture approach (Agentic vs End2End)
+ - **Setup**: Specific model configuration and components
+ - **Audio Encoder**: The audio processing component used
+ - **Remember**: Verbatim accuracy score (WER-based)
+ - **Understand**: Semantic interpretation similarity score
+ - **Apply**: Downstream task performance score
+ - **Speech IQ**: Overall intelligence quotient combining all dimensions
+
+ *Higher scores indicate better performance across all metrics.*
+ """, elem_classes="markdown-text")
+
+         with gr.TabItem("πŸ“Š Analysis", elem_id="analysis-tab", id=1):
+             with gr.Row():
+                 # Build the performance comparison summary
+                 if not speechiq_df.empty:
+                     # Group by model type for comparison
+                     agentic_models = speechiq_df[speechiq_df['Model Type'].str.contains('Agentic', na=False)]
+                     end2end_models = speechiq_df[speechiq_df['Model Type'].str.contains('End2End', na=False)]
+
+                     comparison_text = f"""
+ ### πŸ” Model Type Comparison
+
+ **Agentic Models (ASR + LLM):**
+ - Count: {len(agentic_models)}
+ - Average Speech IQ: {agentic_models['Speech IQ'].mean():.2f}
+ - Best Score: {agentic_models['Speech IQ'].max():.2f}
+
+ **End-to-End Models:**
+ - Count: {len(end2end_models)}
+ - Average Speech IQ: {end2end_models['Speech IQ'].mean():.2f}
+ - Best Score: {end2end_models['Speech IQ'].max():.2f}
+
+ ### 🎯 Cognitive Dimension Analysis
+
+ **Remember (Verbatim Accuracy):**
+ - Best performer: {speechiq_df.loc[speechiq_df['Remember'].idxmax(), 'Setup']} ({speechiq_df['Remember'].max():.3f})
+
+ **Understand (Semantic Similarity):**
+ - Best performer: {speechiq_df.loc[speechiq_df['Understand'].idxmax(), 'Setup']} ({speechiq_df['Understand'].max():.3f})
+
+ **Apply (Task Performance):**
+ - Best performer: {speechiq_df.loc[speechiq_df['Apply'].idxmax(), 'Setup']} ({speechiq_df['Apply'].max():.3f})
+ """
+
+                     gr.Markdown(comparison_text, elem_classes="markdown-text")
+                 else:
+                     gr.Markdown("No data available for analysis.", elem_classes="markdown-text")
+
+         with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
+             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+         with gr.TabItem("πŸš€ Submit", elem_id="submit-tab", id=3):
+             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

+     # Citation section
      with gr.Row():
          with gr.Accordion("πŸ“™ Citation", open=False):
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
+                 lines=6,
                  elem_id="citation-button",
                  show_copy_button=True,
              )

+     # Add refresh functionality
+     with gr.Row():
+         refresh_button = gr.Button("πŸ”„ Refresh Data", variant="secondary")
+
+     def refresh_data():
+         updated_df = load_speechiq_data()
+         return create_leaderboard_table(updated_df), get_top_performers(updated_df)
+
+     refresh_button.click(
+         refresh_data,
+         outputs=[leaderboard_table, stats_markdown]
+     )
+
+ if __name__ == "__main__":
+     demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
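
A quick way to exercise the helpers added above is a small smoke test. This is a sketch only: it assumes the Space's files (app.py, SpeechIQ_table.csv, and the src/ package) are checked out locally with gradio and pandas installed; importing app builds the Blocks UI but does not launch it:

```python
# Illustrative smoke test, not part of the commit.
# Run from the repository root so that "SpeechIQ_table.csv" resolves.
from app import load_speechiq_data, get_top_performers

def main():
    df = load_speechiq_data()

    # The CSV added in this commit contains 13 result rows.
    assert len(df) == 13

    # load_speechiq_data() sorts by Speech IQ in descending order.
    assert df["Speech IQ"].is_monotonic_decreasing

    # The statistics block should name the top-ranked setup.
    stats = get_top_performers(df)
    assert df.iloc[0]["Setup"] in stats
    print(stats)

if __name__ == "__main__":
    main()
```
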
src/about.py CHANGED
@@ -1,72 +1,96 @@
  from dataclasses import dataclass
  from enum import Enum

- @dataclass
- class Task:
-     benchmark: str
-     metric: str
-     col_name: str
-
-
- # Select your tasks here
- # ---------------------------------------------------
- class Tasks(Enum):
-     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-     task0 = Task("anli_r1", "acc", "ANLI")
-     task1 = Task("logiqa", "acc_norm", "LogiQA")

- NUM_FEWSHOT = 0 # Change with your few shot
- # ---------------------------------------------------


- # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""

- # What does your leaderboard evaluate?
- INTRODUCTION_TEXT = """
- Intro text
  """

  # Which evaluations are you running? how can people reproduce what you have?
- LLM_BENCHMARKS_TEXT = f"""
- ## How it works

- ## Reproducibility
- To reproduce our results, here is the commands you can run:

  """

  EVALUATION_QUEUE_TEXT = """
- ## Some good practices before submitting a model

- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
- ```python
- from transformers import AutoConfig, AutoModel, AutoTokenizer
- config = AutoConfig.from_pretrained("your model name", revision=revision)
- model = AutoModel.from_pretrained("your model name", revision=revision)
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
- ```
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

- Note: make sure your model is public!
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

- ### 3) Make sure your model has an open license!
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model πŸ€—

- ### 4) Fill up your model card
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card

- ## In case of model failure
- If your model is displayed in the `FAILED` category, its execution stopped.
- Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
  """

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""
- """

  from dataclasses import dataclass
  from enum import Enum

+ # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title">πŸŽ™οΈ Speech Intelligence Quotient (SpeechIQ) Leaderboard</h1>"""

+ # What does your leaderboard evaluate?
+ INTRODUCTION_TEXT = """
+ ## 🎯 Welcome to the Speech Intelligence Quotient (SpeechIQ) Leaderboard!

+ This leaderboard presents evaluation results for voice understanding large language models (LLM<sub>Voice</sub>) using our SpeechIQ evaluation framework.

+ **SpeechIQ** is a human cognition-inspired evaluation pipeline that assesses voice understanding abilities across three cognitive levels based on Bloom's Taxonomy:

+ - **🧠 Remembering**: Verbatim accuracy (WER-based)
+ - **πŸ’‘ Understanding**: Similarity of the LLM's interpretations
+ - **πŸš€ Application**: QA accuracy on downstream tasks

+ The **Speech IQ Score** provides a unified metric for comparing both cascaded methods (ASR + LLM) and end-to-end models.
  """

  # Which evaluations are you running? how can people reproduce what you have?
+ LLM_BENCHMARKS_TEXT = """
+ ## πŸ“Š About SpeechIQ Evaluation
+
+ **Speech Intelligence Quotient (SpeechIQ)** is a first-of-its-kind intelligence examination that bridges cognitive principles with voice-oriented benchmarks. The framework moves beyond traditional metrics such as Word Error Rate (WER) to provide a comprehensive evaluation of voice understanding capabilities.
+
+ ### 🎯 Evaluation Framework

+ SpeechIQ evaluates models across three cognitive dimensions inspired by Bloom's Taxonomy:

+ 1. **Remember** (Verbatim Accuracy): Tests the model's ability to accurately capture spoken content
+ 2. **Understand** (Interpretation Similarity): Evaluates how well the model comprehends the meaning of speech
+ 3. **Apply** (Downstream Performance): Measures the model's ability to use speech understanding for practical tasks
+
+ ### πŸ† Model Categories
+
+ - **Agentic (ASR + LLM)**: Cascaded approaches using separate ASR and LLM components
+ - **End2End**: Models that process audio directly and generate text end-to-end
+
+ ### πŸ”¬ Key Benefits
+
+ - **Unified Comparisons**: Compare cascaded and end-to-end approaches on equal footing
+ - **Error Detection**: Identify annotation errors in existing benchmarks
+ - **Hallucination Detection**: Detect and quantify hallucinations in voice LLMs
+ - **Cognitive Assessment**: Map model capabilities to human cognitive principles
+
+ ### πŸ“ˆ Speech IQ Score
+
+ The final Speech IQ Score combines performance across all three dimensions to provide a comprehensive measure of voice understanding intelligence.
+
+ ## πŸ”„ Reproducibility
+
+ For detailed methodology and reproduction instructions, please refer to our paper and codebase.
  """

  EVALUATION_QUEUE_TEXT = """
+ ## πŸš€ Submit Your Model for SpeechIQ Evaluation

+ To submit your voice understanding model for SpeechIQ evaluation:

+ ### 1) Ensure Model Compatibility
+ Make sure your model can process audio inputs and generate text outputs in one of these formats:
+ - **ASR + LLM**: Separate ASR and LLM components
+ - **End-to-End**: Direct audio-to-text processing

+ ### 2) Model Requirements
+ - The model must be publicly accessible
+ - Provide clear documentation of the audio input format and expected outputs
+ - Include information about the audio encoder specifications

+ ### 3) Evaluation Domains
+ Your model will be evaluated on:
+ - **Remember**: Transcription accuracy
+ - **Understand**: Semantic understanding
+ - **Apply**: Task-specific performance

+ ### 4) Documentation
+ Please provide:
+ - Model architecture details
+ - Training data information
+ - Audio preprocessing requirements
+ - Expected input/output formats

+ ## πŸ“§ Contact
+
+ For questions about SpeechIQ evaluation or to submit your model, please contact the research team.
  """

  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{speechiq2024,
+   title={Speech Intelligence Quotient (SpeechIQ): A Human Cognition-Inspired Evaluation Framework for Voice Understanding Large Language Models},
+   author={[Authors]},
+   journal={[Journal/Conference]},
+   year={2024}
+ }"""
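
The LLM_BENCHMARKS_TEXT above states that the Speech IQ Score "combines performance across all three dimensions" without spelling out the mapping. As a purely illustrative sketch (an assumption for intuition, not the formula used to produce SpeechIQ_table.csv), here is one conventional way to turn standardized dimension scores into an IQ-style number with mean 100 and standard deviation 15:

```python
# Illustrative only: a toy IQ-style aggregation, NOT necessarily the SpeechIQ
# paper's exact weighting. The Remember/Understand/Apply columns in
# SpeechIQ_table.csv already appear standardized (roughly zero-centered).
import pandas as pd

df = pd.read_csv("SpeechIQ_table.csv")
dims = ["Remember", "Understand", "Apply"]

# Equal-weight average of the standardized dimensions, rescaled like an IQ test.
df["Toy IQ"] = 100 + 15 * df[dims].mean(axis=1)

# Compare the toy aggregate against the official Speech IQ column.
print(df[["Setup", "Speech IQ", "Toy IQ"]].round(2).to_string(index=False))
```
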