|
|
import gradio as gr |
|
|
import soundfile as sf |
|
|
from transformers import AutoProcessor, pipeline |
|
|
from optimum.intel.openvino import OVModelForSpeechSeq2Seq |
|
|
|
|
|
|
|
|
# Hugging Face Hub checkpoint shared by the processor and the model below.
model_id = "distil-whisper/distil-large-v2"

# export=True asks optimum to convert the checkpoint to OpenVINO while loading.
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    export=True,
)
# Limit how many new tokens a single generate() call may emit.
ov_model.generation_config.max_new_tokens = 128

# The processor supplies the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)
|
|
|
|
|
|
|
|
# Speech-recognition pipeline around the OpenVINO model; the audio is handled
# in 15-second chunks that are decoded 16 at a time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=ov_model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    batch_size=16,
    chunk_length_s=15,
)
|
|
|
|
|
|
|
|
def transcribe(audio):
    """Transcribe an audio file to text with the module-level ASR pipeline.

    Args:
        audio: Path of the audio file to transcribe. May be ``None`` or empty
            when the caller has no file to offer.

    Returns:
        The recognised text, or an empty string when no audio was supplied.
    """
    # Nothing to read: return an empty transcript instead of failing in sf.read.
    if not audio:
        return ""

    audio_array, sampling_rate = sf.read(audio, dtype="float32")

    # soundfile yields a (frames, channels) array for multi-channel files, but
    # the ASR pipeline only accepts single-channel input, so down-mix to mono.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)

    # Hand over the file's real sampling rate together with the samples. A bare
    # array would be assumed to already be at the model's rate, which garbles
    # the transcript for e.g. 44.1 kHz recordings.
    # NOTE(review): transformers resamples via torchaudio when the rates
    # differ - confirm torchaudio is installed in the deployment environment.
    result = pipe({"raw": audio_array, "sampling_rate": sampling_rate})
    return result["text"]
|
|
|
|
|
|
|
|
# Web UI: one audio input (delivered to transcribe() as a file path) and a
# plain-text output holding the transcript.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="🧠 Distil-Whisper + OpenVINO ASR",
    description="Upload audio to transcribe using Distil-Whisper accelerated with Intel OpenVINO.",
)

# Start the Gradio server.
demo.launch()
|
|
|