Spaces:

ItzRoBeerT
/

GAIA_agent

Sleeping

App Files Files Community

ItzRoBeerT committed on May 17

Commit

86c6428

1 Parent(s): b4fc57a

Added describe image tool

Browse files

Files changed (1) hide show

tools.py +60 -0

tools.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from smolagents import Tool, DuckDuckGoSearchTool, PythonInterpreterTool, VisitWebpageTool, WikipediaSearchTool
 import whisper
 class read_file(Tool):
 	name="read_file"
@@ -47,6 +50,62 @@ class transcribe_audio(Tool):
 			return f"Error transcribing audio: {str(e)}"
 def return_tools() -> list[Tool]:
 	"""
 	Returns a list of tools to be used by the agent.
@@ -54,6 +113,7 @@ def return_tools() -> list[Tool]:
 	return [
 		read_file(),
 		transcribe_audio(),
 		DuckDuckGoSearchTool(),
 		PythonInterpreterTool(),
 		VisitWebpageTool(),

 from smolagents import Tool, DuckDuckGoSearchTool, PythonInterpreterTool, VisitWebpageTool, WikipediaSearchTool
+from openai import OpenAI
 import whisper
+import base64
+import os
 class read_file(Tool):
 	name="read_file"
 			return f"Error transcribing audio: {str(e)}"
def get_data_uri(image_path: str, base64_image: str):
    """Build a ``data:`` URI for an image that is already base64-encoded.

    The MIME type is resolved from the extension of ``image_path``. The
    standard ``mimetypes`` table is consulted first so that aliases map to
    their registered media types (``.jpg`` -> ``image/jpeg``,
    ``.svg`` -> ``image/svg+xml``). Extensions the table does not know as an
    image type fall back to ``image/<extension>``; a path with no extension
    falls back to ``application/octet-stream`` rather than the malformed
    ``image/``.

    Args:
        image_path: Path (or file name) of the image; only its extension is
            inspected, the file is not opened.
        base64_image: The image bytes, base64-encoded as text.

    Returns:
        A string of the form ``data:<mime-type>;base64,<base64_image>``.
    """
    # Local import: the enclosing file does not import ``mimetypes`` itself.
    import mimetypes

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        # Unknown to the table (or not an image type there): derive the
        # subtype from the extension, as long as there is one.
        _, file_extension = os.path.splitext(image_path)
        file_extension = file_extension.lower().lstrip(".")
        if file_extension:
            mime_type = f"image/{file_extension}"
        else:
            mime_type = "application/octet-stream"
    return f"data:{mime_type};base64,{base64_image}"
class describe_image(Tool):
    """Tool that sends a local image to a chat-completions model and returns its description."""

    name = "describe_image"
    description = "Describe an image and return the description."
    inputs = {
        "image_path": {
            "type": "string",
            "description": "The path to the image file to describe."
        }
    }
    output_type = "string"

    def forward(self, image_path: str) -> str:
        """Describe the image stored at ``image_path``.

        The file is read, base64-encoded, wrapped in a ``data:`` URI and sent
        as an ``image_url`` content part together with a fixed text prompt.

        Args:
            image_path: Path to the image file to describe.

        Returns:
            The content of the first choice of the model response, or a
            string starting with ``"Error describing image: "`` if reading
            the file or calling the API fails.

        Raises:
            ValueError: If the ``OPENROUTER_API_KEY`` environment variable is
                unset or empty.
        """
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            # Name the variable that is actually read, so the message is actionable.
            raise ValueError("OpenRouter API key not provided: OPENROUTER_API_KEY environment variable not set")

        # NOTE(review): when OPENROUTER_BASE_URL is unset, base_url is None and
        # the client falls back to its own default endpoint - confirm that is
        # intended for an OpenRouter key.
        base_url = os.getenv("OPENROUTER_BASE_URL")
        client = OpenAI(api_key=api_key, base_url=base_url)

        try:
            # Keep the file open only for the read, not for the network call.
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")

            data_uri = get_data_uri(image_path, base64_image)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Describe this image in detail. Include information about the main subject, setting, colors, and any notable elements."},
                            {
                                "type": "image_url",
                                "image_url": {"url": data_uri}
                            }
                        ]
                    }
                ],
                max_tokens=500
            )
            return response.choices[0].message.content
        except Exception as e:
            # Deliberate catch-all: failures are reported back to the caller
            # as text instead of propagating.
            return f"Error describing image: {str(e)}"
 def return_tools() -> list[Tool]:
 	"""
 	Returns a list of tools to be used by the agent.
 	return [
 		read_file(),
 		transcribe_audio(),
+		describe_image(),
 		DuckDuckGoSearchTool(),
 		PythonInterpreterTool(),
 		VisitWebpageTool(),