import gradio as gr
import soundfile as sf
from transformers import AutoProcessor, pipeline
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# Load the processor and export the model to OpenVINO IR
model_id = "distil-whisper/distil-large-v2"
processor = AutoProcessor.from_pretrained(model_id)
ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)
ov_model.generation_config.max_new_tokens = 128

# Create the Hugging Face ASR pipeline backed by the OpenVINO model
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=15,
    batch_size=16,
)

# Transcription function: read the uploaded file and run it through the pipeline
def transcribe(audio):
    audio_array, sampling_rate = sf.read(audio)
    # Downmix stereo recordings to mono, which Whisper expects
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    # Pass the sampling rate so the feature extractor can resample if needed
    result = pipe({"raw": audio_array, "sampling_rate": sampling_rate})
    return result["text"]

# Launch the Gradio UI
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="🧠 Distil-Whisper + OpenVINO ASR",
    description="Upload audio to transcribe using Distil-Whisper accelerated with Intel OpenVINO.",
).launch()
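
Because export=True re-converts the PyTorch checkpoint every time the script starts, you may want to cache the converted OpenVINO model on disk. Below is a minimal sketch of that idea, not part of the original script; the ov_distil_whisper directory name is an assumption.

import os
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

save_dir = "ov_distil_whisper"  # hypothetical local cache directory

if os.path.isdir(save_dir):
    # Reuse the previously exported OpenVINO IR files
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(save_dir)
else:
    # First launch: export from the PyTorch checkpoint, then cache the result
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        "distil-whisper/distil-large-v2", export=True
    )
    ov_model.save_pretrained(save_dir)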