Mustafa Acikgoz committed
Commit 2422360
1 Parent(s): 8818841

Fix model argument name and update logic

Files changed:
- app.py (+7 -8)
- inference_model.py (+5 -5)
app.py CHANGED
@@ -24,7 +24,9 @@ model = CLIPModel(
     projection_dim=config.PROJECTION_DIM
 ).to(device)
 
-#
+# --- CRITICAL STEP ---
+# The application will fail if it cannot find the file specified in config.MODEL_PATH.
+# Make sure "clip_book_model.pth" is in the same directory as this script.
 try:
     model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))
     model.eval()
@@ -47,15 +49,14 @@ snapshot_download(
     repo_id=DATASET_REPO_ID,
     repo_type="dataset",
     local_dir=IMAGE_STORAGE_PATH,
-    local_dir_use_symlinks=False
+    local_dir_use_symlinks=False
 )
 print("Image dataset download complete.")
 
 # Get a list of all image file paths
 all_image_paths = glob.glob(os.path.join(IMAGE_STORAGE_PATH, "Flicker8k_Dataset", "*.jpg"))
 
-#
-# Processing all 8000+ images on startup will cause a timeout on Hugging Face Spaces.
+# Use a smaller subset of images to prevent timeouts on public platforms.
 NUM_IMAGES_TO_PROCESS = 1000
 all_image_paths = all_image_paths[:NUM_IMAGES_TO_PROCESS]
 print(f"Found {len(all_image_paths)} total images. Using a subset of {NUM_IMAGES_TO_PROCESS} to prevent timeout.")
@@ -77,8 +78,6 @@ def precompute_image_embeddings(image_paths, model, transform, device):
         image = Image.open(path).convert("RGB")
         image_tensor = transform(image).unsqueeze(0).to(device)
 
-        # **CORRECTION**: Use the full model's forward pass to get projected embeddings.
-        # This returns (image_embedding, text_embedding), so we take the first element.
         embedding, _ = model(image_features=image_tensor)
 
         all_embeddings.append(embedding)
@@ -111,8 +110,8 @@ def find_image_from_text(text_query):
     # 1. Process the text query
     text_inputs = tokenizer([text_query], padding=True, truncation=True, return_tensors="pt").to(device)
 
-    # 2.
-    #
+    # 2. Get the projected text embedding from the model.
+    # No change is needed here because inference_model.py was updated to expect 'attention_mask'.
     _, text_embedding = model(
         text_input_ids=text_inputs['input_ids'],
         attention_mask=text_inputs['attention_mask']
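Taken together, the app.py changes keep both modalities in the same projected embedding space: images are embedded ahead of time through the model's forward pass, and each text query is embedded the same way before ranking. The sketch below is a minimal illustration of that retrieval flow, not the actual app.py: it assumes a model with the dual-input forward shown in this diff (returning (image_embedding, text_embedding)), a Hugging Face tokenizer, and a torchvision-style transform; the cosine-similarity ranking at the end is an illustrative choice and is not shown by this commit.

import torch
import torch.nn.functional as F
from PIL import Image

@torch.no_grad()
def precompute_image_embeddings(image_paths, model, transform, device):
    # Run every image through the model's forward pass so the embeddings
    # come from the projection head, matching the text embedding space.
    embeddings = []
    for path in image_paths:
        image = Image.open(path).convert("RGB")
        image_tensor = transform(image).unsqueeze(0).to(device)
        image_embedding, _ = model(image_features=image_tensor)
        embeddings.append(image_embedding)
    return F.normalize(torch.cat(embeddings, dim=0), dim=-1)

@torch.no_grad()
def find_image_from_text(text_query, model, tokenizer, image_embeddings, image_paths, device):
    # Tokenize the query; after this commit the 'attention_mask' key from the
    # tokenizer can be passed to the model under the same keyword.
    text_inputs = tokenizer([text_query], padding=True, truncation=True, return_tensors="pt").to(device)
    _, text_embedding = model(
        text_input_ids=text_inputs["input_ids"],
        attention_mask=text_inputs["attention_mask"],
    )
    text_embedding = F.normalize(text_embedding, dim=-1)
    # Rank precomputed image embeddings by cosine similarity; highest wins.
    scores = image_embeddings @ text_embedding.squeeze(0)
    return image_paths[int(scores.argmax())]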
inference_model.py CHANGED
@@ -61,18 +61,18 @@ class CLIPModel(nn.Module):
     def __init__(self, image_embedding_dim, text_embedding_dim, projection_dim):
         super().__init__()
 
-        # **CORRECTION**: Renamed 'vision_encoder' to 'image_encoder'
-        # This attribute MUST be named 'image_encoder' to match the call in app.py
         self.image_encoder = VisionEncoder()
-
         self.text_encoder = TextEncoder()
         self.image_projection = ProjectionHead(embedding_dim=image_embedding_dim, projection_dim=projection_dim)
         self.text_projection = ProjectionHead(embedding_dim=text_embedding_dim, projection_dim=projection_dim)
 
-    def forward(self, image_features=None, text_input_ids=None, text_attention_mask=None):
+    def forward(self, image_features=None, text_input_ids=None, attention_mask=None):
         """
         This forward pass handles both image and text inputs.
         app.py will call this to get the final, projected embeddings.
+
+        **MODIFICATION**: Renamed 'text_attention_mask' to 'attention_mask' for
+        compatibility with the standard Hugging Face tokenizer output.
         """
         image_embedding = None
         if image_features is not None:
@@ -86,7 +86,7 @@ class CLIPModel(nn.Module):
             # Get raw features from the text backbone
             text_features_raw = self.text_encoder(
                 input_ids=text_input_ids,
-                attention_mask=text_attention_mask
+                attention_mask=attention_mask
             )
             # Project them into the shared embedding space
             text_embedding = self.text_projection(text_features_raw)
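The parameter rename is what lets app.py hand the tokenizer's 'attention_mask' entry to the model under the same keyword. The snippet below is a minimal, hypothetical sketch of that calling pattern only; DummyTextEncoder, TinyDualEncoder, and the dimensions are stand-ins for illustration, not the real VisionEncoder/TextEncoder from inference_model.py.

import torch
import torch.nn as nn

class DummyTextEncoder(nn.Module):
    # Stand-in text backbone: mean-pools token embeddings, using the
    # attention mask to ignore padding positions.
    def __init__(self, vocab_size=30522, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_dim)

    def forward(self, input_ids, attention_mask):
        hidden = self.embed(input_ids)                       # (B, T, H)
        mask = attention_mask.unsqueeze(-1).float()          # (B, T, 1)
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # (B, H)

class TinyDualEncoder(nn.Module):
    def __init__(self, text_dim=256, projection_dim=128):
        super().__init__()
        self.text_encoder = DummyTextEncoder(hidden_dim=text_dim)
        self.text_projection = nn.Linear(text_dim, projection_dim)

    def forward(self, image_features=None, text_input_ids=None, attention_mask=None):
        # Keyword name matches the tokenizer output key, as in this commit.
        text_embedding = None
        if text_input_ids is not None:
            raw = self.text_encoder(input_ids=text_input_ids, attention_mask=attention_mask)
            text_embedding = self.text_projection(raw)
        return None, text_embedding

# Usage: mirrors the call in app.py after the rename.
model = TinyDualEncoder()
tok_out = {"input_ids": torch.randint(0, 30522, (1, 8)),
           "attention_mask": torch.ones(1, 8, dtype=torch.long)}
_, emb = model(text_input_ids=tok_out["input_ids"],
               attention_mask=tok_out["attention_mask"])
print(emb.shape)  # torch.Size([1, 128])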