[release] speechIQ

Files changed:
- SpeechIQ_table.csv  +14 -0
- app.py              +148 -169
- src/about.py        +71 -47
    	
SpeechIQ_table.csv  ADDED

Model Type,Setup,Audio Encoder,Remember,Understand,Apply,Speech IQ
Agentic: ASR + LLM,Whisper_v2-1.5B + Qwen2_7B,Whisper_v2-1.5B,0.554,0.499,0.481,107.43
Agentic: ASR + LLM,Whisper_v3-1.5B + Qwen2_7B,Whisper_v2-1.5B,0.553,0.433,0.432,106.49
Agentic: ASR + LLM,Canary_1B + Qwen2_7B,Whisper_v2-1.5B,0.559,0.566,0.504,107.78
Agentic: ASR + LLM,OWSM-CTC_v3.1-1B + Qwen2_7B,OWSM-CTC_v3.1-1B,0.534,0.151,0.353,103.05
Agentic: ASR + GER + LLM,Whisper_v2-1.5B + GPT-4o + Qwen2_7B,Whisper_v2-1.5B,0.543,0.632,0.487,108.64
End2End,Qwen2-Audio_7B,1.5B Whisper,-0.187,0.366,0.011,103.88
End2End,Qwen2.5-Omni_7B,1.5B Whisper,0.472,0.41,0.509,105.74
End2End,Salmonn_13B,1.5B Whisper,0.508,0.381,-1.146,101.03
End2End,Desta2_8B,1.5B Whisper,-2.575,-1.604,-0.233,79.69
End2End,AnyGPT_7B,SpeechTokenizer,0.314,-2.718,-2.893,60.02
End2End,Baichuan-omni-1.5_7B,1.5B Whisper,0.448,0.184,0.546,104.02
End2End,Gemini-1.5-flash,Google_USM,-1.885,0.641,0.673,107.85
End2End,Gemini-1.5-pro,Google_USM,0.492,0.409,0.71,107.08
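For reference, the file is a plain CSV that pandas can read directly. A small sketch of how it can be ranked and summarized; the path and column names come from this commit, and the per-family averages quoted in the comment are simple arithmetic over the 13 rows above:

import pandas as pd

# Load the table shipped in this commit and rank models by their Speech IQ score.
df = pd.read_csv("SpeechIQ_table.csv")
ranked = df.sort_values("Speech IQ", ascending=False)
print(ranked[["Setup", "Speech IQ"]].to_string(index=False))

# Per-family averages on this table: Agentic rows come out near 106.7 and
# End2End rows near 96.2, largely because of the two low-scoring outliers.
is_agentic = df["Model Type"].str.contains("Agentic")
print(df.groupby(is_agentic)["Speech IQ"].mean().round(2))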
    	
app.py  CHANGED

Removed from the old template-driven version:

from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from huggingface_hub import snapshot_download
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()

# Leaderboard construction (collapsed in the diff view), ending with:
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",

demo = gr.Blocks(css=custom_css)

# Inside the old `with demo:` block, the About and "Submit here!" tabs with the evaluation queue:
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

# The submission form widgets (collapsed in the diff view), wired up with:
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )
New version of app.py:

import gradio as gr
import pandas as pd
import numpy as np

from src.about import (
    CITATION_BUTTON_LABEL,
    # (remaining src.about imports collapsed in the diff view)
    TITLE,
)
from src.display.css_html_js import custom_css

def load_speechiq_data():
    """Load and process the SpeechIQ results from CSV file."""
    try:
        df = pd.read_csv("SpeechIQ_table.csv")

        # Round numerical columns to 3 decimal places for better display
        numerical_cols = ['Remember', 'Understand', 'Apply', 'Speech IQ']
        for col in numerical_cols:
            if col in df.columns:
                df[col] = df[col].round(3)

        # Sort by Speech IQ score in descending order
        df = df.sort_values('Speech IQ', ascending=False)

        return df
    except Exception as e:
        print(f"Error loading SpeechIQ data: {e}")
        # Return empty dataframe with expected columns if file not found
        return pd.DataFrame(columns=['Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ'])

def create_leaderboard_table(df):
    """Create a formatted leaderboard table with color coding."""
    if df.empty:
        return gr.Dataframe(
            value=df,
            headers=['Model Type', 'Setup', 'Audio Encoder', 'Remember', 'Understand', 'Apply', 'Speech IQ'],
            interactive=False
        )

    return gr.Dataframe(
        value=df,
        headers=df.columns.tolist(),
        interactive=False,
        wrap=True,
        column_widths=["15%", "25%", "15%", "11%", "11%", "11%", "12%"],
        height=600
    )

def get_top_performers(df):
    """Get statistics about top performers."""
    if df.empty:
        return "No data available."

    top_score = df['Speech IQ'].max()
    top_model = df.loc[df['Speech IQ'].idxmax()]

    agentic_best = df[df['Model Type'].str.contains('Agentic', na=False)]['Speech IQ'].max() if not df[df['Model Type'].str.contains('Agentic', na=False)].empty else 0
    end2end_best = df[df['Model Type'].str.contains('End2End', na=False)]['Speech IQ'].max() if not df[df['Model Type'].str.contains('End2End', na=False)].empty else 0

    stats_text = f"""
    ### 📊 Leaderboard Statistics

    **🏆 Top Performer:** {top_model['Setup']} (Score: {top_score})

    **🤖 Best Agentic Model:** {agentic_best}

    **🎯 Best End2End Model:** {end2end_best}

    **📈 Total Models Evaluated:** {len(df)}
    """

    return stats_text

# Load the data
speechiq_df = load_speechiq_data()

# Create the Gradio interface
demo = gr.Blocks(css=custom_css, title="SpeechIQ Leaderboard")

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 SpeechIQ Leaderboard", elem_id="speechiq-leaderboard-tab", id=0):

            # Statistics section
            with gr.Row():
                gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text")

            # Main leaderboard table
            with gr.Row():
                leaderboard_table = create_leaderboard_table(speechiq_df)

            # Legend and explanation
            with gr.Row():
                gr.Markdown("""
                ### 📋 Column Explanations

                - **Model Type**: Architecture approach (Agentic vs End2End)
                - **Setup**: Specific model configuration and components
                - **Audio Encoder**: The audio processing component used
                - **Remember**: Verbatim accuracy score (WER-based)
                - **Understand**: Semantic interpretation similarity score
                - **Apply**: Downstream task performance score
                - **Speech IQ**: Overall intelligence quotient combining all dimensions

                *Higher scores indicate better performance across all metrics.*
                """, elem_classes="markdown-text")

        with gr.TabItem("📊 Analysis", elem_id="analysis-tab", id=1):
            with gr.Row():
                # Create performance comparison charts
                if not speechiq_df.empty:
                    # Group by model type for comparison
                    agentic_models = speechiq_df[speechiq_df['Model Type'].str.contains('Agentic', na=False)]
                    end2end_models = speechiq_df[speechiq_df['Model Type'].str.contains('End2End', na=False)]

                    comparison_text = f"""
                    ### 📈 Model Type Comparison

                    **Agentic Models (ASR + LLM):**
                    - Count: {len(agentic_models)}
                    - Average Speech IQ: {agentic_models['Speech IQ'].mean():.2f}
                    - Best Score: {agentic_models['Speech IQ'].max():.2f}

                    **End-to-End Models:**
                    - Count: {len(end2end_models)}
                    - Average Speech IQ: {end2end_models['Speech IQ'].mean():.2f}
                    - Best Score: {end2end_models['Speech IQ'].max():.2f}

                    ### 🎯 Cognitive Dimension Analysis

                    **Remember (Verbatim Accuracy):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Remember'].idxmax(), 'Setup']} ({speechiq_df['Remember'].max():.3f})

                    **Understand (Semantic Similarity):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Understand'].idxmax(), 'Setup']} ({speechiq_df['Understand'].max():.3f})

                    **Apply (Task Performance):**
                    - Best performer: {speechiq_df.loc[speechiq_df['Apply'].idxmax(), 'Setup']} ({speechiq_df['Apply'].max():.3f})
                    """

                    gr.Markdown(comparison_text, elem_classes="markdown-text")
                else:
                    gr.Markdown("No data available for analysis.", elem_classes="markdown-text")

        with gr.TabItem("📝 About", elem_id="about-tab", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit", elem_id="submit-tab", id=3):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    # Citation section
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=6,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Add refresh functionality
    with gr.Row():
        refresh_button = gr.Button("🔄 Refresh Data", variant="secondary")

        def refresh_data():
            updated_df = load_speechiq_data()
            return create_leaderboard_table(updated_df), get_top_performers(updated_df)

        refresh_button.click(
            refresh_data,
            outputs=[leaderboard_table, gr.Markdown()]
        )

if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
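One wiring detail worth flagging in the new refresh handler: refresh_data returns an updated statistics string as its second value, but the second output slot is a gr.Markdown() instantiated inline inside the refresh row, so the statistics block rendered at the top of the leaderboard tab is never the target of the update. A minimal sketch of the likely intent, assuming the statistics Markdown is kept in a variable; the name stats_md is introduced here for illustration and is not in the commit:

# Hypothetical rewiring (not part of the commit): keep a handle on the stats
# Markdown so the refresh button updates the block shown in the leaderboard tab.
with gr.Row():
    stats_md = gr.Markdown(get_top_performers(speechiq_df), elem_classes="markdown-text")

# ... rest of the layout unchanged ...

refresh_button.click(
    refresh_data,
    outputs=[leaderboard_table, stats_md],
)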
    	
src/about.py  CHANGED

Removed (the old leaderboard template placeholders):

# The stock Task dataclass fields and example Tasks enum:
    benchmark: str
    metric: str
    col_name: str

# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

# The placeholder intro text:
INTRODUCTION_TEXT = """
Intro text
"""

# The old LLM_BENCHMARKS_TEXT (its headings are truncated in the diff view), including:
To reproduce our results, here is the commands you can run:

# The old EVALUATION_QUEUE_TEXT (headings and numbered steps truncated in the diff view), including the template's model-loading check:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).

# The old (empty) citation text:
CITATION_BUTTON_TEXT = r"""
New version of src/about.py:

from dataclasses import dataclass
from enum import Enum

# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">🎙️ Speech Intelligence Quotient (SpeechIQ) Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
## 🎯 Welcome to the Speech Intelligence Quotient (SpeechIQ) Leaderboard!

This leaderboard presents evaluation results for voice understanding large language models (LLM<sub>Voice</sub>) using our novel SpeechIQ evaluation framework.

**SpeechIQ** is a human cognition-inspired evaluation pipeline that assesses voice understanding abilities across three cognitive levels based on Bloom's Taxonomy:

- **🧠 Remembering**: Verbatim accuracy (WER-based)
- **💡 Understanding**: Similarity of LLM's interpretations
- **🚀 Application**: QA accuracy for downstream tasks

The **Speech IQ Score** provides a unified metric for comparing both cascaded methods (ASR+LLM) and end-to-end models.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## 📖 About SpeechIQ Evaluation

**Speech Intelligence Quotient (SpeechIQ)** represents a first-of-its-kind intelligence examination that bridges cognitive principles with voice-oriented benchmarks. Our framework moves beyond traditional metrics like Word Error Rate (WER) to provide comprehensive evaluation of voice understanding capabilities.

### 🎯 Evaluation Framework

SpeechIQ evaluates models across three cognitive dimensions inspired by Bloom's Taxonomy:

1. **Remember** (Verbatim Accuracy): Tests the model's ability to accurately capture spoken content
2. **Understand** (Interpretation Similarity): Evaluates how well the model comprehends the meaning of speech
3. **Apply** (Downstream Performance): Measures the model's ability to use speech understanding for practical tasks

### 📊 Model Categories

- **Agentic (ASR + LLM)**: Cascaded approaches using separate ASR and LLM components
- **End2End**: Direct speech-to-text models that process audio end-to-end

### 🔬 Key Benefits

- **Unified Comparisons**: Compare cascaded and end-to-end approaches on equal footing
- **Error Detection**: Identify annotation errors in existing benchmarks
- **Hallucination Detection**: Detect and quantify hallucinations in voice LLMs
- **Cognitive Assessment**: Map model capabilities to human cognitive principles

### 🏆 Speech IQ Score

The final Speech IQ Score combines performance across all three dimensions to provide a comprehensive measure of voice understanding intelligence.

## 📖 Reproducibility

For detailed methodology and reproduction instructions, please refer to our paper and codebase.
"""

EVALUATION_QUEUE_TEXT = """
## 🚀 Submit Your Model for SpeechIQ Evaluation

To submit your voice understanding model for SpeechIQ evaluation:

### 1) Ensure Model Compatibility
Make sure your model can process audio inputs and generate text outputs in one of these formats:
- **ASR + LLM**: Separate ASR and LLM components
- **End-to-End**: Direct audio-to-text processing

### 2) Model Requirements
- Model must be publicly accessible
- Provide clear documentation of audio input format and expected outputs
- Include information about audio encoder specifications

### 3) Evaluation Domains
Your model will be evaluated across:
- **Remember**: Transcription accuracy
- **Understand**: Semantic understanding
- **Apply**: Task-specific performance

### 4) Documentation
Please provide:
- Model architecture details
- Training data information
- Audio preprocessing requirements
- Expected input/output formats

## 📧 Contact

For questions about SpeechIQ evaluation or to submit your model, please contact the research team.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{speechiq2024,
  title={Speech Intelligence Quotient (SpeechIQ): A Human Cognition-Inspired Evaluation Framework for Voice Understanding Large Language Models},
  author={[Authors]},
  journal={[Journal/Conference]},
  year={2024}
}"""
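The about text describes the Speech IQ Score as a single number unifying the three dimensions, and the CSV above shows the dimension columns on a standardized, z-score-like scale while Speech IQ sits on an IQ-like scale centered near 100. The exact combination is defined in the SpeechIQ paper rather than in this commit; the sketch below only illustrates the general shape of such a mapping, with placeholder equal weights (the table itself shows the real weighting is not uniform, e.g. Gemini-1.5-flash reaches 107.85 despite a negative Remember score):

import numpy as np

def speech_iq(remember: float, understand: float, apply: float,
              weights=(1.0, 1.0, 1.0)) -> float:
    """Illustrative only: map standardized dimension scores to an IQ-style scale.

    The equal weights and the 100 + 15*z rescaling are placeholder assumptions,
    not the combination actually used by SpeechIQ.
    """
    w = np.asarray(weights, dtype=float)
    z = float(np.dot(w, [remember, understand, apply]) / w.sum())
    return 100.0 + 15.0 * z

# The Whisper_v2-1.5B + Qwen2_7B row (0.554, 0.499, 0.481) comes out near 107.7 here,
# in the same ballpark as, but not equal to, the 107.43 reported in SpeechIQ_table.csv.
print(round(speech_iq(0.554, 0.499, 0.481), 2))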

