Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline | |
| from transformers import BlipForQuestionAnswering | |
| from transformers.utils import logging | |
| logging.set_verbosity_error() | |
| from transformers import AutoProcessor | |
# Text-to-speech pipeline; get_pipeline_prediction feeds it the decoded
# answer text.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")

# BLIP question-answering model and its processor, both loaded from the
# same "Salesforce/blip-vqa-base" checkpoint.
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
def get_pipeline_prediction(pil_image, question):
    """Answer a question about an image and return the answer as audio.

    The image and question are encoded with ``processor``, an answer is
    generated by ``model`` and decoded to text, and that text is passed
    to ``tts_pipe`` for narration.

    Args:
        pil_image: Image from the Gradio ``Image`` input (configured with
            ``type="pil"``). ``None`` when the user submitted no image.
        question: Question text from the Gradio ``Textbox`` input.

    Returns:
        A ``(sampling_rate, audio)`` tuple built from the ``tts_pipe``
        output, in the form consumed by the ``gr.Audio`` output component.

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail early with a user-facing message instead of letting the
    # processor/model raise an opaque error on a missing image.
    if pil_image is None:
        raise gr.Error("Please provide an image before asking a question.")

    inputs = processor(pil_image, question, return_tensors="pt")
    out = model.generate(**inputs)
    text = processor.decode(out[0], skip_special_tokens=True)

    narrated_text = tts_pipe(text)
    # NOTE(review): "[0]" presumably drops a leading batch/channel axis of
    # the returned audio array -- confirm against the installed
    # transformers version.
    return (narrated_text["sampling_rate"], narrated_text["audio"][0])
# Gradio UI: an image and a free-text question go in, narrated audio
# comes out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=[
        gr.Image(type="pil", label="Input image"),
        gr.Textbox(label="Ask your question"),
    ],
    outputs=gr.Audio(type="numpy", label="Narration", autoplay=True),
)

# Serve on all network interfaces, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")