fumucu committed
Commit f0496c5 · 1 Parent(s): ba33898

initial commit

Files changed (7)
  1. .gitattributes +2 -0
  2. app.py +31 -0
  3. draw_utils.py +112 -0
  4. images/1.jpg +3 -0
  5. images/2.jpg +0 -0
  6. images/3.jpg +3 -0
  7. laod_pipeline.py +74 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ images/1.jpg filter=lfs diff=lfs merge=lfs -text
+ images/3.jpg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,31 @@
+ import gradio as gr
+ from laod_pipeline import laod_gdino
+
+ examples = [['images/1.jpg'], ['images/2.jpg'], ['images/3.jpg']]
+ title = "LAOD: LLM-Guided Agentic Object Detection for Open-World Understanding"
+
+ # --- HTML/CSS for centered horizontal badge links ---
+ # A div with flexbox centers the badges and adds a gap between them.
+ description = """
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center; gap: 15px;">
+     <p style="margin: 0;">For more details:</p>
+     <a href="https://github.com/furkanmumcu/LAOD" target="_blank">
+         <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
+     </a>
+     <a href="https://arxiv.org/abs/2507.10844" target="_blank">
+         <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
+     </a>
+ </div>
+ """
+
+ demo = gr.Interface(
+     fn=laod_gdino,
+     inputs=gr.Image(label="Upload an Image", type="pil"),
+     outputs=gr.Image(label="Output"),
+     examples=examples,
+     title=title,
+     description=description
+ )
+
+ # Launch the Gradio app
+ demo.launch()
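A small, optional refinement (my suggestion, not part of this commit): guarding the launch call would let laod_gdino and the Interface be imported from app.py, for tests or scripts, without immediately starting the server. The last two lines could instead read:

if __name__ == "__main__":
    # Only start the Gradio server when app.py is run directly.
    demo.launch()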
draw_utils.py ADDED
@@ -0,0 +1,112 @@
+ import cv2
+ import numpy as np
+ from PIL import Image, ImageDraw, ImageFont
+
+ def visualize_detections(image_np, draw_results):
+     """
+     Visualizes bounding box detections on an image with improved styling.
+
+     Args:
+         image_np (np.array): The input image as a NumPy array (OpenCV format, BGR channel order).
+         draw_results (list): A list containing three arrays:
+             [boxes_array, scores_array, labels_array].
+             - boxes_array (list): List of bounding boxes in
+               [x_min, y_min, x_max, y_max] format, e.g., [[x1, y1, x2, y2], ...].
+             - scores_array (list): List of confidence scores, e.g., [s1, s2, ...].
+             - labels_array (list): List of labels, e.g., ["label1", "label2", ...].
+
+     Returns:
+         np.array: The image with visualized detections, as a NumPy array (OpenCV format).
+     """
+
+     # Convert the OpenCV image (NumPy array, BGR) to a PIL Image (RGB) for text drawing.
+     # PIL offers better font rendering capabilities.
+     image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
+     draw = ImageDraw.Draw(image_pil)
+
+     # Define a vibrant color palette for bounding boxes and text backgrounds.
+     # These colors cycle through for different detections.
+     colors = [
+         (255, 99, 71),    # Tomato
+         (60, 179, 113),   # MediumSeaGreen
+         (65, 105, 225),   # RoyalBlue
+         (255, 215, 0),    # Gold
+         (186, 85, 211),   # MediumOrchid
+         (0, 206, 209),    # DarkTurquoise
+         (255, 140, 0),    # DarkOrange
+         (124, 252, 0),    # LawnGreen
+         (255, 105, 180),  # HotPink
+         (75, 0, 130)      # Indigo
+     ]
+
+     # Try to load a common TrueType font (like Arial) for better text quality.
+     # Fall back to the default PIL font if 'arial.ttf' is not found.
+     try:
+         font = ImageFont.truetype("arial.ttf", 20)  # Font size 20, adjust as needed
+     except IOError:
+         font = ImageFont.load_default()
+         print("Warning: Could not load 'arial.ttf'. Using default PIL font.")
+
+     # Unpack the boxes, scores, and labels directly from the draw_results list,
+     # assuming draw_results is always [boxes_array, scores_array, labels_array].
+     if len(draw_results) != 3:
+         print("Error: draw_results must contain exactly three arrays: boxes, scores, and labels.")
+         return image_np  # Return the original image if the format is incorrect
+
+     boxes, scores, labels = draw_results
+
+     # Process each individual detection
+     for i, (box, score, label) in enumerate(zip(boxes, scores, labels)):
+         # Ensure box coordinates are integers for drawing
+         x, y, x2, y2 = [int(round(coord, 0)) for coord in box.tolist()]
+         score_item = round(score.item(), 3)  # Round the score for display
+
+         print(f"Detected {label} with confidence {score_item} at location {[x, y, x2, y2]}")
+
+         # Select a color from the palette, cycling through them
+         current_color = colors[i % len(colors)]
+         text_fill_color = (255, 255, 255)  # White text for good contrast on colored backgrounds
+
+         # Draw the bounding box on the PIL image using ImageDraw.
+         # This ensures the rectangle is drawn on the same image object as the text.
+         draw.rectangle([(x, y), (x2, y2)], outline=current_color, width=2)  # Thickness 2
+
+         # Prepare the text string including label and score
+         display_text = f"{label} ({score_item:.2f})"
+
+         # Calculate text size with PIL's font to determine the background rectangle dimensions.
+         # textbbox (modern Pillow) returns (left, top, right, bottom) of the text bounding box.
+         left, top, right, bottom = draw.textbbox((0, 0), display_text, font=font)
+         text_width = right - left
+         text_height = bottom - top
+
+         # Determine the text position to prevent overflow.
+         # The default position is slightly above the bounding box.
+         text_x = x
+         text_y = y - text_height - 5  # 5 pixels of padding above the text
+
+         # If the text would go above the image boundary (y < 0),
+         # place it just inside the top of the bounding box instead.
+         if text_y < 0:
+             text_y = y + 5  # 5 pixels of padding below the top edge of the box
+
+         # Draw a filled rectangle as a background for the text for better readability.
+         # The background rectangle uses the same color as the bounding box,
+         # with a small padding around the text.
+         bg_x1 = text_x
+         bg_y1 = text_y
+         bg_x2 = text_x + text_width + 8   # Add padding to the width
+         bg_y2 = text_y + text_height + 8  # Add padding to the height
+
+         # Ensure the background rectangle does not extend beyond the image boundaries
+         bg_x2 = min(bg_x2, image_pil.width)
+         bg_y2 = min(bg_y2, image_pil.height)
+
+         draw.rectangle([(bg_x1, bg_y1), (bg_x2, bg_y2)], fill=current_color)
+
+         # Draw the text on the PIL image, offset by the padding
+         draw.text((text_x + 4, text_y + 4), display_text, font=font, fill=text_fill_color)
+
+     # Convert the modified PIL image (RGB) back to OpenCV format (BGR)
+     final_image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+     return final_image_np
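For a quick check outside the app, visualize_detections can be exercised on its own. The sketch below is illustrative only: the canvas, box, score, label, and output path are invented, and tensors are used because the function calls .tolist() and .item() on each box and score; the input image is expected in OpenCV BGR order.

import cv2
import numpy as np
import torch

from draw_utils import visualize_detections

# A blank 480x640 BGR canvas with one made-up detection.
image_bgr = np.zeros((480, 640, 3), dtype=np.uint8)
boxes = torch.tensor([[50.0, 60.0, 300.0, 240.0]])  # [x_min, y_min, x_max, y_max]
scores = torch.tensor([0.87])
labels = ["person"]

annotated = visualize_detections(image_bgr, [boxes, scores, labels])
cv2.imwrite("annotated.jpg", annotated)  # hypothetical output path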
images/1.jpg ADDED

Git LFS Details

  • SHA256: dea9e7ef97386345f7cff32f9055da4982da5471c48d575146c796ab4563b04e
  • Pointer size: 131 Bytes
  • Size of remote file: 173 kB
images/2.jpg ADDED
images/3.jpg ADDED

Git LFS Details

  • SHA256: 691ee375b2a6ef3ef76e8c4d6da686a3898a6400921a7332e446e2b539f28a65
  • Pointer size: 131 Bytes
  • Size of remote file: 116 kB
laod_pipeline.py ADDED
@@ -0,0 +1,74 @@
+ import cv2
+ import numpy as np
+ import torch
+ from transformers import pipeline
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+ import draw_utils
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ # LLM stage: proposes the set of object labels present in the image.
+ pipe = pipeline(
+     "image-text-to-text",
+     model="google/gemma-3-4b-it",
+     device_map='auto',
+     torch_dtype=torch.bfloat16
+ )
+
+ # Detection stage: open-vocabulary detector grounded on the proposed labels.
+ model_id = "IDEA-Research/grounding-dino-tiny"
+ processor = AutoProcessor.from_pretrained(model_id)
+ model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+
+ def laod_gdino(image):
+     messages = [
+         {
+             "role": "system",
+             "content": [{"type": "text", "text": "Just give the list of objects in the given picture, separated by commas. Do not write anything else."}]
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "List the objects that you see in the given picture."},
+                 {"type": "image", "url": image},
+             ]
+         },
+     ]
+
+     output = pipe(text=messages, max_new_tokens=500)
+     llm_response = output[0]["generated_text"][-1]["content"]
+     print(llm_response)
+
+     # Normalize person-related labels. 'woman' is replaced before 'man',
+     # since 'man' is a substring of 'woman'.
+     llm_response = llm_response.lower()
+     llm_response = llm_response.replace('pedestrian', 'person')
+     llm_response = llm_response.replace('people', 'person')
+     llm_response = llm_response.replace('woman', 'person')
+     llm_response = llm_response.replace('man', 'person')
+
+     llm_labels = llm_response.replace(', ', ',').split(',')
+     print(llm_labels)
+
+     # Grounding DINO expects one list of label phrases per image in the batch.
+     llm_labels = [llm_labels]
+
+     inputs = processor(images=image, text=llm_labels, return_tensors="pt").to(device)
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     results = processor.post_process_grounded_object_detection(
+         outputs,
+         inputs.input_ids,
+         box_threshold=0.4,
+         text_threshold=0.3,
+         target_sizes=[image.size[::-1]]
+     )
+
+     result = results[0]
+
+     # draw_utils expects an OpenCV-style BGR array, so convert from PIL (RGB),
+     # draw the detections, then convert back to RGB for the Gradio output.
+     image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+     draw_results = [result["boxes"], result["scores"], result["labels"]]
+     annotated_bgr = draw_utils.visualize_detections(image_bgr, draw_results)
+     return cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)
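Putting the pieces together, a minimal end-to-end run outside the Gradio UI might look like the sketch below. It assumes the example images from this commit are present, that both checkpoints can be downloaded (google/gemma-3-4b-it is gated, so a Hugging Face token with access may be required), and that laod_gdino returns an RGB NumPy array as above; the output filename is arbitrary.

from PIL import Image

from laod_pipeline import laod_gdino

# Stage 1 (Gemma) proposes labels; stage 2 (Grounding DINO) localizes them.
image = Image.open("images/2.jpg").convert("RGB")
annotated = laod_gdino(image)

Image.fromarray(annotated).save("laod_result.jpg")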