Spaces: Running on Zero

initial commit

- .gitattributes +2 -0
- app.py +31 -0
- draw_utils.py +112 -0
- images/1.jpg +3 -0
- images/2.jpg +0 -0
- images/3.jpg +3 -0
- laod_pipeline.py +74 -0
.gitattributes
CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/1.jpg filter=lfs diff=lfs merge=lfs -text
+images/3.jpg filter=lfs diff=lfs merge=lfs -text
app.py
ADDED

@@ -0,0 +1,31 @@
import gradio as gr
from laod_pipeline import laod_gdino

examples = [['images/1.jpg'], ['images/2.jpg'], ['images/3.jpg']]
title = "LAOD: LLM-Guided Agentic Object Detection for Open-World Understanding"

# --- HTML/CSS for Centered Horizontal Buttons ---
# We use a div with Flexbox to center the buttons and add a gap between them.
description = """
<div style="display: flex; justify-content: center; align-items: center; text-align: center; gap: 15px;">
    <p style="margin: 0;">For more details:</p>
    <a href="https://github.com/furkanmumcu/LAOD" target="_blank">
        <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
    </a>
    <a href="https://arxiv.org/abs/2507.10844" target="_blank">
        <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
    </a>
</div>
"""

demo = gr.Interface(
    fn=laod_gdino,
    inputs=gr.Image(label="Upload an Image", type="pil"),
    outputs=gr.Image(label="Output"),
    examples=examples,
    title=title,
    description=description
)

# Launch the Gradio app.
demo.launch()
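
Not part of the commit: a quick way to check the Gradio wiring above without downloading the Gemma and Grounding DINO weights is to point the same Interface at a stub function. A minimal sketch, assuming only Gradio is installed; stub_fn and the file name ui_smoke_test.py are illustrative, not part of this repository:

# ui_smoke_test.py (hypothetical helper, not in this commit)
import gradio as gr

def stub_fn(image):
    # Echo the uploaded image back; stands in for laod_gdino while testing only the UI.
    return image

demo = gr.Interface(
    fn=stub_fn,
    inputs=gr.Image(label="Upload an Image", type="pil"),
    outputs=gr.Image(label="Output"),
    title="LAOD UI smoke test",
)

if __name__ == "__main__":
    demo.launch()
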
draw_utils.py
ADDED

@@ -0,0 +1,112 @@
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def visualize_detections(image_np, draw_results):
    """
    Visualizes bounding box detections on an image with improved styling.

    Args:
        image_np (np.array): The input image as a NumPy array (OpenCV format, BGR channel order).
        draw_results (list): A list containing three arrays:
            [boxes_array, scores_array, labels_array].
            - boxes_array (list): List of bounding boxes, e.g., [[x1, y1, x2, y2], ...].
              Note: boxes should be in [x_min, y_min, x_max, y_max] format.
            - scores_array (list): List of confidence scores, e.g., [s1, s2, ...].
            - labels_array (list): List of labels, e.g., ["label1", "label2", ...].
    Returns:
        np.array: The image with visualized detections, as a NumPy array (OpenCV format).
    """

    # Convert the OpenCV image (NumPy array, BGR) to a PIL Image (RGB) for text drawing.
    # PIL offers better font rendering capabilities.
    image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(image_pil)

    # Define a vibrant color palette for bounding boxes and text backgrounds.
    # These colors cycle through for different detections.
    colors = [
        (255, 99, 71),    # Tomato
        (60, 179, 113),   # MediumSeaGreen
        (65, 105, 225),   # RoyalBlue
        (255, 215, 0),    # Gold
        (186, 85, 211),   # MediumOrchid
        (0, 206, 209),    # DarkTurquoise
        (255, 140, 0),    # DarkOrange
        (124, 252, 0),    # LawnGreen
        (255, 105, 180),  # HotPink
        (75, 0, 130)      # Indigo
    ]

    # Try to load a common TrueType font (like Arial) for better text quality.
    # Fall back to the default PIL font if 'arial.ttf' is not found.
    try:
        font = ImageFont.truetype("arial.ttf", 20)  # Font size 20, adjust as needed
    except IOError:
        font = ImageFont.load_default()
        print("Warning: Could not load 'arial.ttf'. Using default PIL font.")

    # Unpack the boxes, scores, and labels directly from the draw_results list,
    # assuming draw_results is always [boxes_array, scores_array, labels_array].
    if len(draw_results) != 3:
        print("Error: draw_results must contain exactly three arrays: boxes, scores, and labels.")
        return image_np  # Return the original image if the format is incorrect

    boxes, scores, labels = draw_results

    # Process each individual detection
    for i, (box, score, label) in enumerate(zip(boxes, scores, labels)):
        # Ensure box coordinates are integers for drawing
        x, y, x2, y2 = [int(round(coord, 0)) for coord in box.tolist()]
        score_item = round(score.item(), 3)  # Round score for display

        print(f"Detected {label} with confidence {score_item} at location {[x, y, x2, y2]}")

        # Select a color from the palette, cycling through them
        current_color = colors[i % len(colors)]
        text_fill_color = (255, 255, 255)  # White text for good contrast on colored backgrounds

        # Draw the bounding box on the PIL image using ImageDraw.
        # This ensures the rectangle is drawn on the same image object as the text.
        draw.rectangle([(x, y), (x2, y2)], outline=current_color, width=2)  # Thickness 2

        # Prepare the text string including label and score
        display_text = f"{label} ({score_item:.2f})"

        # Calculate text size using PIL's font to determine background rectangle dimensions.
        # textbbox (modern Pillow) returns (left, top, right, bottom)
        # of the text bounding box.
        left, top, right, bottom = draw.textbbox((0, 0), display_text, font=font)
        text_width = right - left
        text_height = bottom - top

        # Determine the text position to prevent overflow.
        # The default position is slightly above the bounding box.
        text_x = x
        text_y = y - text_height - 5  # 5 pixels padding above the text

        # If the text would go above the image boundary (y < 0),
        # place it just inside the top of the bounding box instead.
        if text_y < 0:
            text_y = y + 5  # 5 pixels padding below the top edge of the box

        # Draw a filled rectangle as a background for the text for better readability.
        # The background rectangle uses the same color as the bounding box,
        # with a small padding around the text.
        bg_x1 = text_x
        bg_y1 = text_y
        bg_x2 = text_x + text_width + 8   # Add padding to the width
        bg_y2 = text_y + text_height + 8  # Add padding to the height

        # Ensure the background rectangle does not go beyond the image boundaries
        bg_x2 = min(bg_x2, image_pil.width)
        bg_y2 = min(bg_y2, image_pil.height)

        draw.rectangle([(bg_x1, bg_y1), (bg_x2, bg_y2)], fill=current_color)

        # Draw the text on the PIL image
        draw.text((text_x + 4, text_y + 4), display_text, font=font, fill=text_fill_color)  # Offset by the padding

    # Convert the modified PIL image (RGB) back to OpenCV format (BGR)
    final_image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    return final_image_np
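
For reference, a minimal sketch of calling visualize_detections on its own, outside the Space. The torch tensors mirror the shapes that laod_pipeline.py passes in; the blank canvas, boxes, scores, and labels are fabricated purely for illustration:

import cv2
import numpy as np
import torch
from draw_utils import visualize_detections

# A blank 640x480 canvas plus two made-up detections in [x_min, y_min, x_max, y_max] format.
canvas = np.full((480, 640, 3), 255, dtype=np.uint8)
boxes = torch.tensor([[50.0, 60.0, 200.0, 220.0], [300.0, 100.0, 500.0, 400.0]])
scores = torch.tensor([0.91, 0.47])
labels = ["cat", "chair"]

annotated = visualize_detections(canvas, [boxes, scores, labels])
cv2.imwrite("demo_boxes.jpg", annotated)
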
images/1.jpg
ADDED
Stored with Git LFS.

images/2.jpg
ADDED

images/3.jpg
ADDED
Stored with Git LFS.
laod_pipeline.py
ADDED

@@ -0,0 +1,74 @@
import torch
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import numpy as np
import draw_utils

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# LLM that proposes the set of object labels present in the image.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    # device="cuda:1",
    device_map='auto',
    torch_dtype=torch.bfloat16
)

# Open-vocabulary detector that grounds the proposed labels as bounding boxes.
model_id = "IDEA-Research/grounding-dino-tiny"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

def laod_gdino(image):
    # Step 1: ask the LLM for a comma-separated list of the objects in the image.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "Just give the list of objects in the given picture separated by commas. Do not write anything else."}]
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "List the objects that you see in the given picture."},
                {"type": "image", "url": image},
            ]
        },
    ]

    output = pipe(text=messages, max_new_tokens=500)
    print(output[0]["generated_text"][-1]["content"])

    llm_response = output[0]["generated_text"][-1]["content"]

    # Step 2: normalize the labels so common synonyms map to a single detector prompt.
    llm_response = llm_response.lower()
    llm_response = llm_response.replace('pedestrian', 'person')
    llm_response = llm_response.replace('people', 'person')
    llm_response = llm_response.replace('woman', 'person')  # before 'man', so 'woman' is not partially rewritten
    llm_response = llm_response.replace('man', 'person')

    llm_labels = llm_response.replace(', ', ',').split(',')

    print(llm_labels)

    llm_labels = [llm_labels]  # Grounding DINO expects one list of labels per image

    # Step 3: run zero-shot object detection with the LLM-proposed labels.
    inputs = processor(images=image, text=llm_labels, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=0.4,
        text_threshold=0.3,
        target_sizes=[image.size[::-1]]
    )

    result = results[0]
    image = np.array(image)

    # Step 4: draw the detections and return the annotated image.
    draw_results = [result["boxes"], result["scores"], result["labels"]]
    return draw_utils.visualize_detections(image, draw_results)
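
Not part of the commit: a minimal end-to-end sketch of calling laod_gdino directly on one of the bundled example images. It assumes a machine with enough GPU memory (or patience on CPU) and access to the google/gemma-3-4b-it and IDEA-Research/grounding-dino-tiny weights; importing laod_pipeline triggers the model downloads:

from PIL import Image
from laod_pipeline import laod_gdino  # loads Gemma 3 and Grounding DINO at import time

# Run the full LLM-label -> Grounding DINO -> drawing pipeline on one example image.
image = Image.open("images/1.jpg").convert("RGB")
annotated = laod_gdino(image)  # NumPy array with boxes and labels drawn
Image.fromarray(annotated).save("annotated_1.jpg")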