Mustafa Acikgoz committed · Commit 8818841 · 1 parent: 296fb5d

Fix: Correct image_encoder attribute and prevent startup timeout

Files changed:
- app.py +33 -39
- inference_model.py +30 -9
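For context, the attribute mismatch named in the commit title surfaces as an AttributeError the first time app.py reaches for model.image_encoder. The snippet below is a minimal stand-in that reproduces that failure mode; only the attribute names 'vision_encoder' and 'image_encoder' come from the diff, everything else is hypothetical.

# Minimal, hypothetical stand-in for the pre-fix failure mode.
import torch.nn as nn

class BrokenCLIPModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_encoder = nn.Identity()  # old attribute name

model = BrokenCLIPModel()
try:
    model.image_encoder  # the attribute app.py expected to find
except AttributeError as err:
    print(err)  # roughly: 'BrokenCLIPModel' object has no attribute 'image_encoder'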
app.py
CHANGED

@@ -10,11 +10,10 @@ import glob
from tqdm import tqdm

# --- Custom Modules ---
-# These imports assume your config.py and model files are in the same directory
import config
from inference_model import CLIPModel

-# --- 1. Initial Setup: Load Model and Tokenizer
+# --- 1. Initial Setup: Load Model and Tokenizer ---
print("Starting application setup...")
device = config.DEVICE

@@ -32,35 +31,34 @@ try:
    print("CLIP Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    model = None

# Load the text tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
print("Tokenizer loaded successfully.")

-# --- 2. Data Handling: Download and Pre-process Images (runs once on startup) ---
-# This is the key section that connects your app to your image dataset.

-#
+# --- 2. Data Handling: Download and Pre-process Images ---
DATASET_REPO_ID = "mustafa2ak/Flickr8k-Images"
-# Define the local folder where the images will be stored inside the Space
IMAGE_STORAGE_PATH = "./flickr8k_images"

print(f"Downloading image dataset from {DATASET_REPO_ID}...")
-# Use snapshot_download for a fast, server-to-server transfer
snapshot_download(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    local_dir=IMAGE_STORAGE_PATH,
-    local_dir_use_symlinks=False #
+    local_dir_use_symlinks=False # Set to False for Spaces compatibility
)
print("Image dataset download complete.")

# Get a list of all image file paths
-# **CORRECTION**: The dataset structure has images directly in 'Flicker8k_Dataset'
-# The original code was looking for a subfolder named 'images', which doesn't exist.
all_image_paths = glob.glob(os.path.join(IMAGE_STORAGE_PATH, "Flicker8k_Dataset", "*.jpg"))
-
+
+# **CRITICAL FIX FOR TIMEOUT**: Use a smaller subset of images for the demo.
+# Processing all 8000+ images on startup will cause a timeout on Hugging Face Spaces.
+NUM_IMAGES_TO_PROCESS = 1000
+print(f"Found {len(all_image_paths)} total images. Using a subset of {NUM_IMAGES_TO_PROCESS} to prevent timeout.")
+all_image_paths = all_image_paths[:NUM_IMAGES_TO_PROCESS]

# Define the image preprocessing pipeline
image_transform = transforms.Compose([

@@ -70,43 +68,39 @@ image_transform = transforms.Compose([
])

def precompute_image_embeddings(image_paths, model, transform, device):
-    """
-
-    This is a crucial optimization.
-    """
-    print("Pre-computing image embeddings... This may take a few minutes.")
+    """Processes all images and computes their final embeddings for fast searching."""
+    print("Pre-computing image embeddings... This may take a minute.")
    all_embeddings = []
-    # torch.no_grad() disables gradient calculation, making this much faster
    with torch.no_grad():
-        # tqdm creates a progress bar in your logs
        for path in tqdm(image_paths, desc="Processing Images"):
            try:
                image = Image.open(path).convert("RGB")
                image_tensor = transform(image).unsqueeze(0).to(device)
-
-
+
+                # **CORRECTION**: Use the full model's forward pass to get projected embeddings.
+                # This returns (image_embedding, text_embedding), so we take the first element.
+                embedding, _ = model(image_features=image_tensor)
+
                all_embeddings.append(embedding)
            except Exception as e:
                print(f"Warning: Could not process image {path}. Error: {e}")
                continue
-    # Combine the list of individual tensors into one large tensor
    return torch.cat(all_embeddings, dim=0)

# Pre-compute all image embeddings and store them in memory
if model and all_image_paths:
    image_embeddings_precomputed = precompute_image_embeddings(all_image_paths, model, image_transform, device)
    # Normalize the embeddings once for faster similarity calculation
    image_embeddings_precomputed = F.normalize(image_embeddings_precomputed, p=2, dim=-1)
    print("Image embeddings pre-computed and stored.")
else:
    image_embeddings_precomputed = None
    print("Skipping embedding pre-computation due to missing model or images.")

+
# --- 3. The Main Gradio Function for Text-to-Image Search ---
def find_image_from_text(text_query):
-    """
-    Takes a text query and finds the best matching image from the pre-computed embeddings.
-    """
+    """Takes a text query and finds the best matching image."""
    if not text_query:
        return None, "Please enter a text query."
    if image_embeddings_precomputed is None:

@@ -114,31 +108,32 @@ def find_image_from_text(text_query):

    print(f"Searching for text: '{text_query}'")
    with torch.no_grad():
        # 1. Process the text query
        text_inputs = tokenizer([text_query], padding=True, truncation=True, return_tensors="pt").to(device)
-
-
-            attention_mask=text_inputs['attention_mask']
+
+        # 2. **CORRECTION**: Use the full model's forward pass to get projected text embedding.
+        # This returns (image_embedding, text_embedding), so we take the second element.
+        _, text_embedding = model(
+            text_input_ids=text_inputs['input_ids'],
+            text_attention_mask=text_inputs['attention_mask']
        )
-
+
+        # 3. Normalize the text embedding
        text_embedding_norm = F.normalize(text_embedding, p=2, dim=-1)

-        #
-        # This is a fast matrix multiplication: (1, 512) @ (512, N_images) -> (1, N_images)
+        # 4. Calculate similarity against all pre-computed image embeddings
        similarity_scores = (text_embedding_norm @ image_embeddings_precomputed.T).squeeze(0)

-        #
+        # 5. Find the index of the image with the highest score
        best_image_index = similarity_scores.argmax().item()
-
-        # 5. Get the file path of the best image
        best_image_path = all_image_paths[best_image_index]
        best_score = similarity_scores[best_image_index].item()

        print(f"Found best match: {best_image_path} with score {best_score:.4f}")

-        # Return the path to the best image and a caption for the UI
        return best_image_path, f"Best match with score: {best_score:.4f}"

+
# --- 4. Create and Launch the Gradio Interface ---
iface = gr.Interface(
    fn=find_image_from_text,

@@ -148,9 +143,8 @@
        gr.Textbox(label="Result Details")
    ],
    title="🖼️ Text-to-Image Search with CLIP",
-    description="Enter a text description to search for the most relevant image in the Flickr8k dataset. The app
+    description="Enter a text description to search for the most relevant image in the Flickr8k dataset. The app uses a pre-trained CLIP-like model to find the best match from a subset of 1000 images.",
    allow_flagging="never"
)

-# This starts the web server
iface.launch()
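The search step in find_image_from_text reduces to a cosine-similarity lookup: because the text embedding and the pre-computed image embeddings are all L2-normalized, a single matrix product yields every similarity score at once, as the removed "(1, 512) @ (512, N_images)" shape comment describes. Below is a self-contained sketch with random tensors; the 512-dimensional projection size is an assumption taken from that comment, and the real value comes from config.py.

# Standalone sketch of the similarity lookup; dimensions are illustrative (1000 images, 512-d embeddings).
import torch
import torch.nn.functional as F

image_embeddings = F.normalize(torch.randn(1000, 512), p=2, dim=-1)  # stand-in for the precomputed bank
text_embedding = F.normalize(torch.randn(1, 512), p=2, dim=-1)       # stand-in for the query embedding

# (1, 512) @ (512, 1000) -> (1, 1000); with unit-norm vectors the dot product is cosine similarity
scores = (text_embedding @ image_embeddings.T).squeeze(0)
best_index = scores.argmax().item()
print(f"best index: {best_index}, score: {scores[best_index].item():.4f}")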
inference_model.py
CHANGED

@@ -1,32 +1,38 @@
-# inference_model.py
import torch
import torch.nn as nn
from torchvision.models import resnet50
from transformers import DistilBertModel

-# ---
+# --- Helper Classes (VisionEncoder, TextEncoder, ProjectionHead) ---
+# These define the components of the overall CLIP model.
+
class VisionEncoder(nn.Module):
    def __init__(self):
        super().__init__()
-        #
+        # Use the recommended 'weights' parameter for pre-trained models
        pretrained_resnet50 = resnet50(weights='IMAGENET1K_V1')
+        # Use all layers of ResNet50 except for the final fully connected layer
        self.model = nn.Sequential(*list(pretrained_resnet50.children())[:-1])
+        # Freeze the parameters of the vision encoder
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, x):
        x = self.model(x)
+        # Flatten the output to a 1D tensor per image
        return x.view(x.size(0), -1)

class TextEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+        # Freeze the parameters of the text encoder
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, input_ids, attention_mask=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        # Use the embedding of the [CLS] token as the sentence representation
        return outputs.last_hidden_state[:, 0, :]

class ProjectionHead(nn.Module):

@@ -43,31 +49,46 @@ class ProjectionHead(nn.Module):
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
+        # Add a residual connection
        x = x + projected
        x = self.layer_norm(x)
        return x

-# ---
+# --- Main CLIPModel for Inference ---
+# This class combines the encoders and projection heads.
+
class CLIPModel(nn.Module):
    def __init__(self, image_embedding_dim, text_embedding_dim, projection_dim):
        super().__init__()
-
+
+        # **CORRECTION**: Renamed 'vision_encoder' to 'image_encoder'
+        # This attribute MUST be named 'image_encoder' to match the call in app.py
+        self.image_encoder = VisionEncoder()
+
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding_dim, projection_dim=projection_dim)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding_dim, projection_dim=projection_dim)

    def forward(self, image_features=None, text_input_ids=None, text_attention_mask=None):
+        """
+        This forward pass handles both image and text inputs.
+        app.py will call this to get the final, projected embeddings.
+        """
        image_embedding = None
        if image_features is not None:
-
-
+            # Get raw features from the vision backbone
+            image_features_raw = self.image_encoder(image_features)
+            # Project them into the shared embedding space
+            image_embedding = self.image_projection(image_features_raw)

        text_embedding = None
        if text_input_ids is not None:
-
+            # Get raw features from the text backbone
+            text_features_raw = self.text_encoder(
                input_ids=text_input_ids,
                attention_mask=text_attention_mask
            )
-
+            # Project them into the shared embedding space
+            text_embedding = self.text_projection(text_features_raw)

        return image_embedding, text_embedding
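Taken together, the corrected CLIPModel is meant to be instantiated with the backbone feature sizes and then queried through its forward pass, exactly as the updated app.py does. The sketch below shows that intended call pattern; 2048 and 768 are the standard feature sizes of ResNet-50 and DistilBERT, while the projection size of 512 is an assumption matching the comment removed from app.py (the real values come from config.py).

# Hedged usage sketch of the corrected inference path; dimensions are assumptions noted above.
import torch
from transformers import DistilBertTokenizer
from inference_model import CLIPModel

model = CLIPModel(image_embedding_dim=2048, text_embedding_dim=768, projection_dim=512)
model.eval()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
text_inputs = tokenizer(["a dog running on the beach"], padding=True, truncation=True, return_tensors="pt")
dummy_image = torch.randn(1, 3, 224, 224)  # stand-in for a transformed image tensor

with torch.no_grad():
    image_embedding, _ = model(image_features=dummy_image)
    _, text_embedding = model(
        text_input_ids=text_inputs['input_ids'],
        text_attention_mask=text_inputs['attention_mask'],
    )

print(image_embedding.shape, text_embedding.shape)  # both project to (1, 512) under these assumptions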