import os
import tempfile
import warnings

import gradio as gr
from gtts import gTTS

import inference_script
import vit_gpt2

warnings.filterwarnings('ignore')
# Define problem statement
problem_statement = """
### Problem Statement
This project develops a deep learning model that verbally describes the contents of an image for visually impaired users, using caption generation with an attention mechanism on the Flickr8K dataset. Inspired by the paper "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention", the model uses a CNN-RNN architecture to extract image features and generate captions, making visual content accessible. The Kaggle dataset comprises 8,000 images, each paired with five descriptive captions, giving the model several reference descriptions per image.
"""
# Define solution overview
solution_overview = """
### Solution Overview
The basic model, **trained for only 20 epochs without extensive hyperparameter tuning,** primarily explores how the attention mechanism integrates with an Encoder-Decoder architecture built via model subclassing. To improve caption quality at inference time, a pretrained ViT-GPT2 model is also integrated. [Visit the Kaggle notebook](https://www.kaggle.com/code/krishna2308/eye-for-blind) for implementation details.
"""
# Define real-life scenario application
real_life_scenario = """
### Real-life Scenario Application
While the current implementation does not support real-time processing, the potential for future development is vast. Imagine a visually impaired individual wearing smart glasses equipped with a camera. As they move around, the camera captures live footage of their surroundings, which is processed in real time by the image captioning model integrated into the glasses. The generated spoken descriptions can be streamed directly to the user's earpiece, providing instant audio feedback about their environment.
"""
def process_image_and_generate_output(image, model_selection):
    if image is None:
        return "Please select an image", None

    if model_selection == "Basic Model (Results won't be good)":
        result = inference_script.evaluate(image)
        # Drop the trailing end-of-sequence token and remove <unk> tokens
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        pred_caption = pred_caption.replace('<unk>', '')
    elif model_selection == 'ViT-GPT2':
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # Generate speech from the caption and save it to a temporary MP3 file
    tts = gTTS(text=pred_caption, lang='en', slow=False)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name
        tts.save(audio_file_path)

    # Return the file path: gr.Audio accepts a filepath and reads it itself,
    # so the temp file must outlive this function (no os.unlink here)
    return pred_caption, audio_file_path
sample_images = [
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "Basic Model (Results won't be good)"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "Basic Model (Results won't be good)"]
]
# Image upload component
image_input = gr.Image(label="Upload Image")

# Radio buttons to choose the captioning model
model_selection_input = gr.Radio(["Basic Model (Results won't be good)",
                                  "ViT-GPT2"],
                                 label="Choose Model")
iface = gr.Interface(fn=process_image_and_generate_output,
                     inputs=[image_input, model_selection_input],
                     outputs=[gr.Text(label="Caption"), gr.Audio(label="Audio")],
                     examples=sample_images,
                     allow_flagging='never',
                     title="Eye For Blind | Image Captioning & TTS Demo",
                     description=f"{problem_statement}\n\n{solution_overview}\n\n{real_life_scenario}")

iface.launch()