coda / cleanup_deleted_images.py
justinkay
Cleanup script
e7063f6
raw
history blame
2.99 kB
#!/usr/bin/env python3
"""
Script to clean up references to deleted images from:
- iwildcam_demo_annotations.json
- iwildcam_demo.pt
- iwildcam_demo_labels.pt
- images.txt
"""
import json
import os
import torch
# Input/output locations, relative to the directory being cleaned.
IMAGE_DIR = "iwildcam_demo_images"
IMAGES_TXT = "images.txt"
DEMO_PT = "iwildcam_demo.pt"
LABELS_PT = "iwildcam_demo_labels.pt"
ANNOTATIONS_JSON = "iwildcam_demo_annotations.json"


def _select_existing(current_images, existing_images):
    """Return ``(valid_images, valid_indices)`` for entries still on disk.

    ``valid_indices`` are positions in ``current_images`` (the old ordering),
    which is what the tensor rows are indexed by.
    """
    kept = [
        (idx, img)
        for idx, img in enumerate(current_images)
        if img in existing_images
    ]
    valid_images = [img for _, img in kept]
    valid_indices = [idx for idx, _ in kept]
    return valid_images, valid_indices


def _filter_annotations(annotations, existing_images):
    """Drop JSON records that refer to deleted image files (in place).

    Image records are matched on their ``file_name``; annotation records are
    kept only if their ``image_id`` belongs to a surviving image record.
    """
    if "images" not in annotations:
        # Without image records there is no id -> file mapping, so the
        # annotation list cannot be filtered; leave the document untouched
        # instead of failing on a missing key.
        return

    original_count = len(annotations["images"])
    annotations["images"] = [
        img for img in annotations["images"]
        if img["file_name"] in existing_images
    ]
    print(f"Updated JSON images: {original_count} -> {len(annotations['images'])}")

    if "annotations" in annotations:
        valid_image_ids = {img["id"] for img in annotations["images"]}
        original_count = len(annotations["annotations"])
        annotations["annotations"] = [
            ann for ann in annotations["annotations"]
            if ann["image_id"] in valid_image_ids
        ]
        print(f"Updated JSON annotations: {original_count} -> {len(annotations['annotations'])}")


def _write_atomically(path, write):
    """Run ``write(tmp_path)`` and then move the result over ``path``.

    The destination is only replaced once the new content has been written
    completely, so an interrupted run never leaves a truncated file behind.
    """
    tmp_path = path + ".tmp"
    try:
        write(tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only still present if writing or replacing failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _write_lines(path, lines):
    """Write ``lines`` to ``path``, one per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines)


def _write_json(path, obj):
    """Serialize ``obj`` to ``path`` as indented JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2)


def main(base_dir="."):
    """Remove every reference to deleted images from the demo data files.

    Everything is loaded, validated and filtered in memory first; files are
    only rewritten afterwards, and ``images.txt`` (the source of the row
    indices) is rewritten last.

    Args:
        base_dir: Directory containing the image folder and the data files.
            Defaults to the current working directory.

    Raises:
        ValueError: If the tensor files do not have one row per entry of
            ``images.txt`` (e.g. after an earlier, partially failed run).
    """
    def located(name):
        return os.path.join(base_dir, name)

    # Get list of existing images
    existing_images = set(os.listdir(located(IMAGE_DIR)))
    print(f"Found {len(existing_images)} existing images")

    # Read images.txt to get current order
    with open(located(IMAGES_TXT), "r", encoding="utf-8") as f:
        current_images = [line.strip() for line in f]
    # Trailing blank lines are not image rows and must not count towards N.
    while current_images and not current_images[-1]:
        current_images.pop()
    print(f"Found {len(current_images)} images in images.txt")

    # Identify which images still exist and their old indices
    valid_images, valid_indices = _select_existing(current_images, existing_images)
    print(f"Keeping {len(valid_images)} images")
    print(f"Removing {len(current_images) - len(valid_images)} images")

    # Load .pt files
    demo_tensors = torch.load(located(DEMO_PT))
    demo_labels = torch.load(located(LABELS_PT))
    print(f"Original iwildcam_demo.pt shape: {demo_tensors.shape}")
    print(f"Original iwildcam_demo_labels.pt shape: {demo_labels.shape}")

    # The indices are only meaningful if the tensors have exactly one row per
    # listed image; otherwise filtering would silently pick the wrong rows.
    expected = len(current_images)
    if len(demo_tensors.shape) < 3 or demo_tensors.shape[1] != expected:
        raise ValueError(
            f"{DEMO_PT} has shape {tuple(demo_tensors.shape)}; expected "
            f"dimension 1 to match the {expected} entries of {IMAGES_TXT}"
        )
    if len(demo_labels.shape) < 1 or demo_labels.shape[0] != expected:
        raise ValueError(
            f"{LABELS_PT} has shape {tuple(demo_labels.shape)}; expected "
            f"dimension 0 to match the {expected} entries of {IMAGES_TXT}"
        )

    # Original author's note: demo_tensors has shape [3, N, 5] where N is the
    # number of images, so images are filtered along dimension 1.
    filtered_demo = demo_tensors[:, valid_indices, :]
    filtered_labels = demo_labels[valid_indices]

    # Load and filter JSON annotations
    with open(located(ANNOTATIONS_JSON), "r", encoding="utf-8") as f:
        annotations = json.load(f)
    _filter_annotations(annotations, existing_images)

    # All inputs were read and filtered successfully; now persist the results.
    _write_atomically(located(DEMO_PT), lambda tmp: torch.save(filtered_demo, tmp))
    _write_atomically(located(LABELS_PT), lambda tmp: torch.save(filtered_labels, tmp))
    print(f"Updated iwildcam_demo.pt: {demo_tensors.shape} -> {filtered_demo.shape}")
    print(f"Updated iwildcam_demo_labels.pt: {demo_labels.shape} -> {filtered_labels.shape}")

    _write_atomically(located(ANNOTATIONS_JSON), lambda tmp: _write_json(tmp, annotations))
    print("Updated iwildcam_demo_annotations.json")

    # images.txt goes last: until this point it still describes the old row
    # order, so a failure above can be detected by the shape check on re-run.
    _write_atomically(located(IMAGES_TXT), lambda tmp: _write_lines(tmp, valid_images))
    print("Updated images.txt")

    print("\nCleanup complete!")


if __name__ == "__main__":
    main()