Apedlop committed
Commit 5bb262a · 1 Parent(s): 3572569
.env.plantilla ADDED
@@ -0,0 +1 @@
+ TOKEN=""
.gitignore CHANGED
@@ -1 +1,2 @@
  venv_prueba/
+ .env
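Taken together, `.env.plantilla` and the new `.gitignore` entry follow the usual python-dotenv pattern: copy the template to a gitignored `.env`, fill in the Hugging Face token, and load it at runtime. A minimal sketch of the consuming side (the fail-fast guard is illustrative and not part of the commit):

import os

from dotenv import load_dotenv

load_dotenv()  # reads TOKEN from the gitignored .env into the environment

TOKEN = os.getenv("TOKEN")
if not TOKEN:
    # Hypothetical guard, not in the repository: fail early with a clear message.
    raise RuntimeError("TOKEN is missing; copy .env.plantilla to .env and set it.")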
diffusers_disc.py ADDED
@@ -0,0 +1,65 @@
+ import gradio as gr
+ import torch
+ import soundfile as sf
+ from PIL import Image
+ from transformers import pipeline
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from datasets import load_dataset
+
+ # -------------------------
+ # IMAGE -> TEXT MODEL
+ # -------------------------
+ modeloObtenerTextoImagen = pipeline(
+     "image-to-text",
+     model="Salesforce/blip-image-captioning-base"
+ )
+
+ # -------------------------
+ # TEXT -> AUDIO MODEL
+ # -------------------------
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+ modeloTextoAudio = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+ # Base voice
+ dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(dataset[0]["xvector"]).unsqueeze(0)
+
+ # -------------------------
+ # MAIN FUNCTION
+ # -------------------------
+ def obtenerDescripcionAudio(imagen):
+     # Image -> Text
+     resultadoModeloTI = modeloObtenerTextoImagen(Image.fromarray(imagen))
+     texto_generado = resultadoModeloTI[0]["generated_text"]
+
+     print(f"La frase obtenida de la imagen es: {texto_generado}")
+
+     # Text -> Audio
+     inputs = processor(text=texto_generado, return_tensors="pt")
+     audio = modeloTextoAudio.generate_speech(
+         inputs["input_ids"],
+         speaker_embeddings,
+         vocoder=vocoder
+     )
+
+     ruta_audio = "audio_salida.wav"
+     sf.write(ruta_audio, audio.numpy(), samplerate=16000)
+
+     return texto_generado, ruta_audio
+
+ # -------------------------
+ # GRADIO INTERFACE
+ # -------------------------
+ demo = gr.Interface(
+     fn=obtenerDescripcionAudio,
+     inputs=gr.Image(label="📷 Sube una imagen"),
+     outputs=[
+         gr.Textbox(label="📝 Texto generado"),
+         gr.Audio(label="🔊 Audio generado", type="filepath")
+     ],
+     title="Asistente Visual Accesible",
+     description="Sube una imagen y la aplicación describe lo que ve y lo lee en voz alta."
+ )
+
+ demo.launch(share=True)
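Once the interface has produced a result, the WAV sits next to the script. A minimal sanity check, assuming `audio_salida.wav` already exists from a previous run (illustrative only, not part of the commit):

import soundfile as sf

# Read back the file written by obtenerDescripcionAudio and report its properties.
datos, frecuencia = sf.read("audio_salida.wav")
print(f"{frecuencia} Hz, {len(datos) / frecuencia:.2f} s of audio")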
app4.py → diffusers_empr.py RENAMED
File without changes
app3.py → inf_prov_disc.py RENAMED
@@ -1,9 +1,13 @@
  import os
+ from dotenv import load_dotenv
  import requests
 
+ load_dotenv()
+
  API_URL = "https://router.huggingface.co/hf-inference/models/philschmid/bart-large-cnn-samsum"
+ TOKEN = os.getenv("TOKEN")
  headers = {
-     "Authorization": f"Bearer [token here]",
+     "Authorization": f"Bearer {TOKEN}",
  }
 
  def query(payload):
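The hunk stops at the `query` signature; its body is not shown. For context, a sketch of the standard Hugging Face Inference API call this header setup is normally paired with (the POST pattern below is assumed, not taken from the file):

import os

import requests
from dotenv import load_dotenv

load_dotenv()

API_URL = "https://router.huggingface.co/hf-inference/models/philschmid/bart-large-cnn-samsum"
headers = {"Authorization": f"Bearer {os.getenv('TOKEN')}"}

def query(payload):
    # Assumed implementation: POST the payload and return the parsed JSON response.
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    return response.json()

print(query({"inputs": "Ana: ¿Quedamos a las 5? Luis: Vale, en la cafetería de siempre."}))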
inf_prov_empr.py ADDED
@@ -0,0 +1,17 @@
+ from huggingface_hub import InferenceClient
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ client = InferenceClient(
+     provider="hf-inference",
+     api_key=os.getenv("TOKEN"),
+ )
+
+ output = client.image_segmentation(
+     "https://s1.ppllstatics.com/mujerhoy/www/multimedia/202306/02/media/cortadas/[email protected]",
+     model="jonathandinu/face-parsing"
+ )
+
+ print(output)
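If the raw `print(output)` is too terse, the segments can be walked individually. A sketch, assuming the usual per-segment fields (`label`, `score`, `mask`) returned by `InferenceClient.image_segmentation`; exact types vary with the installed `huggingface_hub` version:

import os

from dotenv import load_dotenv
from huggingface_hub import InferenceClient

load_dotenv()

client = InferenceClient(provider="hf-inference", api_key=os.getenv("TOKEN"))

segmentos = client.image_segmentation(
    "https://s1.ppllstatics.com/mujerhoy/www/multimedia/202306/02/media/cortadas/[email protected]",
    model="jonathandinu/face-parsing",
)

for segmento in segmentos:
    # label identifies the facial region; score is the model's confidence (assumed fields).
    print(segmento.label, segmento.score)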
app1.py → transformer_disc.py RENAMED
File without changes
app2.py → transformer_empr.py RENAMED
File without changes