import gradio as gr
import soundfile as sf
from transformers import AutoProcessor, pipeline
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# Load the processor and export the model to OpenVINO IR
model_id = "distil-whisper/distil-large-v2"
processor = AutoProcessor.from_pretrained(model_id)
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)
ov_model.generation_config.max_new_tokens = 128

# Create the Hugging Face ASR pipeline backed by the OpenVINO model
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=15,
    batch_size=16,
)

# Transcription function: read the uploaded file and run it through the pipeline
def transcribe(audio):
    audio_array, sampling_rate = sf.read(audio)
    # Downmix stereo recordings to mono, which Whisper expects
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    # Pass the sampling rate so the feature extractor can resample if needed
    result = pipe({"raw": audio_array, "sampling_rate": sampling_rate})
    return result["text"]

# Launch the Gradio UI
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="🧠 Distil-Whisper + OpenVINO ASR",
    description="Upload audio to transcribe using Distil-Whisper accelerated with Intel OpenVINO.",
).launch()
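
Because export=True re-converts the PyTorch checkpoint every time the script starts, you may want to cache the converted OpenVINO model on disk. Below is a minimal sketch of that idea, not part of the original script; the ov_distil_whisper directory name is an assumption.

import os
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

save_dir = "ov_distil_whisper"  # hypothetical local cache directory

if os.path.isdir(save_dir):
    # Reuse the previously exported OpenVINO IR files
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(save_dir)
else:
    # First launch: export from the PyTorch checkpoint, then cache the result
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        "distil-whisper/distil-large-v2", export=True
    )
    ov_model.save_pretrained(save_dir)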