Spaces: Running on Zero

initial commit

- .gitattributes +2 -0
- app.py +31 -0
- draw_utils.py +112 -0
- images/1.jpg +3 -0
- images/2.jpg +0 -0
- images/3.jpg +3 -0
- laod_pipeline.py +74 -0
.gitattributes
CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/1.jpg filter=lfs diff=lfs merge=lfs -text
+images/3.jpg filter=lfs diff=lfs merge=lfs -text
app.py
ADDED

@@ -0,0 +1,31 @@
import gradio as gr
from laod_pipeline import laod_gdino

examples = [['images/1.jpg'], ['images/2.jpg'], ['images/3.jpg']]
title = "LAOD: LLM-Guided Agentic Object Detection for Open-World Understanding"

# --- HTML/CSS for Centered Horizontal Buttons ---
# We use a div with Flexbox to center the buttons and add a gap between them.
description = """
<div style="display: flex; justify-content: center; align-items: center; text-align: center; gap: 15px;">
    <p style="margin: 0;">For more details:</p>
    <a href="https://github.com/furkanmumcu/LAOD" target="_blank">
        <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
    </a>
    <a href="https://arxiv.org/abs/2507.10844" target="_blank">
        <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
    </a>
</div>
"""

demo = gr.Interface(
    fn=laod_gdino,
    inputs=gr.Image(label="Upload an Image", type="pil"),
    outputs=gr.Image(label="Output"),
    examples=examples,
    title=title,
    description=description
)

# Launch the Gradio app.
demo.launch()
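
Not part of the commit: a quick way to check the Gradio wiring above without downloading the Gemma and Grounding DINO weights is to point the same Interface at a stub function. A minimal sketch, assuming only Gradio is installed; stub_fn and the file name ui_smoke_test.py are illustrative, not part of this repository:

# ui_smoke_test.py (hypothetical helper, not in this commit)
import gradio as gr

def stub_fn(image):
    # Echo the uploaded image back; stands in for laod_gdino while testing only the UI.
    return image

demo = gr.Interface(
    fn=stub_fn,
    inputs=gr.Image(label="Upload an Image", type="pil"),
    outputs=gr.Image(label="Output"),
    title="LAOD UI smoke test",
)

if __name__ == "__main__":
    demo.launch()
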
draw_utils.py
ADDED

@@ -0,0 +1,112 @@
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def visualize_detections(image_np, draw_results):
    """
    Visualizes bounding box detections on an image with improved styling.

    Args:
        image_np (np.array): The input image as a NumPy array (OpenCV format, BGR channel order).
        draw_results (list): A list containing three arrays:
            [boxes_array, scores_array, labels_array].
            - boxes_array (list): List of bounding boxes, e.g., [[x1, y1, x2, y2], ...].
              Note: boxes should be in [x_min, y_min, x_max, y_max] format.
            - scores_array (list): List of confidence scores, e.g., [s1, s2, ...].
            - labels_array (list): List of labels, e.g., ["label1", "label2", ...].
    Returns:
        np.array: The image with visualized detections, as a NumPy array (OpenCV format).
    """

    # Convert the OpenCV image (NumPy array, BGR) to a PIL Image (RGB) for text drawing.
    # PIL offers better font rendering capabilities.
    image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(image_pil)

    # Define a vibrant color palette for bounding boxes and text backgrounds.
    # These colors cycle through for different detections.
    colors = [
        (255, 99, 71),    # Tomato
        (60, 179, 113),   # MediumSeaGreen
        (65, 105, 225),   # RoyalBlue
        (255, 215, 0),    # Gold
        (186, 85, 211),   # MediumOrchid
        (0, 206, 209),    # DarkTurquoise
        (255, 140, 0),    # DarkOrange
        (124, 252, 0),    # LawnGreen
        (255, 105, 180),  # HotPink
        (75, 0, 130)      # Indigo
    ]

    # Try to load a common TrueType font (like Arial) for better text quality.
    # Fall back to the default PIL font if 'arial.ttf' is not found.
    try:
        font = ImageFont.truetype("arial.ttf", 20)  # Font size 20, adjust as needed
    except IOError:
        font = ImageFont.load_default()
        print("Warning: Could not load 'arial.ttf'. Using default PIL font.")

    # Unpack the boxes, scores, and labels directly from the draw_results list,
    # assuming draw_results is always [boxes_array, scores_array, labels_array].
    if len(draw_results) != 3:
        print("Error: draw_results must contain exactly three arrays: boxes, scores, and labels.")
        return image_np  # Return the original image if the format is incorrect

    boxes, scores, labels = draw_results

    # Process each individual detection
    for i, (box, score, label) in enumerate(zip(boxes, scores, labels)):
        # Ensure box coordinates are integers for drawing
        x, y, x2, y2 = [int(round(coord, 0)) for coord in box.tolist()]
        score_item = round(score.item(), 3)  # Round score for display

        print(f"Detected {label} with confidence {score_item} at location {[x, y, x2, y2]}")

        # Select a color from the palette, cycling through them
        current_color = colors[i % len(colors)]
        text_fill_color = (255, 255, 255)  # White text for good contrast on colored backgrounds

        # Draw the bounding box on the PIL image using ImageDraw.
        # This ensures the rectangle is drawn on the same image object as the text.
        draw.rectangle([(x, y), (x2, y2)], outline=current_color, width=2)  # Thickness 2

        # Prepare the text string including label and score
        display_text = f"{label} ({score_item:.2f})"

        # Calculate text size using PIL's font to determine background rectangle dimensions.
        # textbbox (modern Pillow) returns (left, top, right, bottom)
        # of the text bounding box.
        left, top, right, bottom = draw.textbbox((0, 0), display_text, font=font)
        text_width = right - left
        text_height = bottom - top

        # Determine the text position to prevent overflow.
        # The default position is slightly above the bounding box.
        text_x = x
        text_y = y - text_height - 5  # 5 pixels padding above the text

        # If the text would go above the image boundary (y < 0),
        # place it just inside the top of the bounding box instead.
        if text_y < 0:
            text_y = y + 5  # 5 pixels padding below the top edge of the box

        # Draw a filled rectangle as a background for the text for better readability.
        # The background rectangle uses the same color as the bounding box,
        # with a small padding around the text.
        bg_x1 = text_x
        bg_y1 = text_y
        bg_x2 = text_x + text_width + 8   # Add padding to the width
        bg_y2 = text_y + text_height + 8  # Add padding to the height

        # Ensure the background rectangle does not go beyond the image boundaries
        bg_x2 = min(bg_x2, image_pil.width)
        bg_y2 = min(bg_y2, image_pil.height)

        draw.rectangle([(bg_x1, bg_y1), (bg_x2, bg_y2)], fill=current_color)

        # Draw the text on the PIL image
        draw.text((text_x + 4, text_y + 4), display_text, font=font, fill=text_fill_color)  # Offset by the padding

    # Convert the modified PIL image (RGB) back to OpenCV format (BGR)
    final_image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    return final_image_np
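
For reference, a minimal sketch of calling visualize_detections on its own, outside the Space. The torch tensors mirror the shapes that laod_pipeline.py passes in; the blank canvas, boxes, scores, and labels are fabricated purely for illustration:

import cv2
import numpy as np
import torch
from draw_utils import visualize_detections

# A blank 640x480 canvas plus two made-up detections in [x_min, y_min, x_max, y_max] format.
canvas = np.full((480, 640, 3), 255, dtype=np.uint8)
boxes = torch.tensor([[50.0, 60.0, 200.0, 220.0], [300.0, 100.0, 500.0, 400.0]])
scores = torch.tensor([0.91, 0.47])
labels = ["cat", "chair"]

annotated = visualize_detections(canvas, [boxes, scores, labels])
cv2.imwrite("demo_boxes.jpg", annotated)
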
images/1.jpg
ADDED
Stored with Git LFS.

images/2.jpg
ADDED

images/3.jpg
ADDED
Stored with Git LFS.
laod_pipeline.py
ADDED

@@ -0,0 +1,74 @@
import torch
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import numpy as np
import draw_utils

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# LLM that proposes the set of object labels present in the image.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    # device="cuda:1",
    device_map='auto',
    torch_dtype=torch.bfloat16
)

# Open-vocabulary detector that grounds the proposed labels as bounding boxes.
model_id = "IDEA-Research/grounding-dino-tiny"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

def laod_gdino(image):
    # Step 1: ask the LLM for a comma-separated list of the objects in the image.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "Just give the list of objects in the given picture separated by commas. Do not write anything else."}]
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "List the objects that you see in the given picture."},
                {"type": "image", "url": image},
            ]
        },
    ]

    output = pipe(text=messages, max_new_tokens=500)
    print(output[0]["generated_text"][-1]["content"])

    llm_response = output[0]["generated_text"][-1]["content"]

    # Step 2: normalize the labels so common synonyms map to a single detector prompt.
    llm_response = llm_response.lower()
    llm_response = llm_response.replace('pedestrian', 'person')
    llm_response = llm_response.replace('people', 'person')
    llm_response = llm_response.replace('woman', 'person')  # before 'man', so 'woman' is not partially rewritten
    llm_response = llm_response.replace('man', 'person')

    llm_labels = llm_response.replace(', ', ',').split(',')

    print(llm_labels)

    llm_labels = [llm_labels]  # Grounding DINO expects one list of labels per image

    # Step 3: run zero-shot object detection with the LLM-proposed labels.
    inputs = processor(images=image, text=llm_labels, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=0.4,
        text_threshold=0.3,
        target_sizes=[image.size[::-1]]
    )

    result = results[0]
    image = np.array(image)

    # Step 4: draw the detections and return the annotated image.
    draw_results = [result["boxes"], result["scores"], result["labels"]]
    return draw_utils.visualize_detections(image, draw_results)
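
Not part of the commit: a minimal end-to-end sketch of calling laod_gdino directly on one of the bundled example images. It assumes a machine with enough GPU memory (or patience on CPU) and access to the google/gemma-3-4b-it and IDEA-Research/grounding-dino-tiny weights; importing laod_pipeline triggers the model downloads:

from PIL import Image
from laod_pipeline import laod_gdino  # loads Gemma 3 and Grounding DINO at import time

# Run the full LLM-label -> Grounding DINO -> drawing pipeline on one example image.
image = Image.open("images/1.jpg").convert("RGB")
annotated = laod_gdino(image)  # NumPy array with boxes and labels drawn
Image.fromarray(annotated).save("annotated_1.jpg")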