ItzRoBeerT committed on
Commit
86c6428
·
1 Parent(s): b4fc57a

Added describe image tool

Browse files
Files changed (1) hide show
  1. tools.py +60 -0
tools.py CHANGED
@@ -1,5 +1,8 @@
1
  from smolagents import Tool, DuckDuckGoSearchTool, PythonInterpreterTool, VisitWebpageTool, WikipediaSearchTool
 
2
  import whisper
 
 
3
 
4
  class read_file(Tool):
5
  name="read_file"
@@ -47,6 +50,62 @@ class transcribe_audio(Tool):
47
  return f"Error transcribing audio: {str(e)}"
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def return_tools() -> list[Tool]:
51
  """
52
  Returns a list of tools to be used by the agent.
@@ -54,6 +113,7 @@ def return_tools() -> list[Tool]:
54
  return [
55
  read_file(),
56
  transcribe_audio(),
 
57
  DuckDuckGoSearchTool(),
58
  PythonInterpreterTool(),
59
  VisitWebpageTool(),
 
1
  from smolagents import Tool, DuckDuckGoSearchTool, PythonInterpreterTool, VisitWebpageTool, WikipediaSearchTool
2
+ from openai import OpenAI
3
  import whisper
4
+ import base64
5
+ import os
6
 
7
  class read_file(Tool):
8
  name="read_file"
 
50
  return f"Error transcribing audio: {str(e)}"
51
 
52
 
53
def get_data_uri(image_path: str, base64_image: str) -> str:
    """Build a ``data:`` URI for an already base64-encoded image.

    The MIME subtype is derived from the file extension of ``image_path``
    (case-insensitive). Extensions whose registered subtype differs from
    the extension itself (e.g. ``.jpg`` -> ``image/jpeg``) are mapped to
    the registered name; any other extension is used verbatim.

    Args:
        image_path: Path (or file name) of the image; only its extension
            is inspected, the file is not opened here.
        base64_image: The image bytes, already base64-encoded as text.

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.
        A path without an extension produces an empty subtype.
    """
    # File extensions that are not themselves valid image subtypes.
    subtype_aliases = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "jfif": "jpeg",
        "tif": "tiff",
        "svg": "svg+xml",
    }

    _, file_extension = os.path.splitext(image_path)
    extension = file_extension.lower().lstrip(".")
    subtype = subtype_aliases.get(extension, extension)

    return f"data:image/{subtype};base64,{base64_image}"
60
+
61
class describe_image(Tool):
    """Tool that sends a local image to a chat-completions model and returns its description."""

    name = "describe_image"
    description = "Describe an image and return the description."
    inputs = {
        "image_path": {
            "type": "string",
            "description": "The path to the image file to describe."
        }
    }

    output_type = "string"

    def forward(self, image_path: str) -> str:
        """Describe the image stored at ``image_path``.

        The file is read, base64-encoded, wrapped in a data URI and sent
        together with a fixed text prompt to the ``gpt-4o`` model through
        the OpenAI client.

        Args:
            image_path: Path to the image file to describe.

        Returns:
            The model's description, or a string starting with
            ``"Error describing image: "`` if reading the file or calling
            the API raised an exception.

        Raises:
            ValueError: If the ``OPENROUTER_API_KEY`` environment variable
                is unset or empty.
        """
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            # The message names the variable that is actually read above.
            raise ValueError("OpenRouter API key not provided: OPENROUTER_API_KEY environment variable not set")

        # May be None when the variable is unset; it is handed to the client as-is.
        base_url = os.getenv("OPENROUTER_BASE_URL")
        client = OpenAI(api_key=api_key, base_url=base_url)

        try:
            with open(image_path, 'rb') as image_file:
                base64_image = base64.b64encode(image_file.read()).decode('utf-8')

            data_uri = get_data_uri(image_path, base64_image)

            # NOTE(review): the model id is hard-coded; confirm it matches the
            # naming scheme of whatever endpoint OPENROUTER_BASE_URL points at.
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": "Describe this image in detail. Include information about the main subject, setting, colors, and any notable elements."},
                            {
                                "type": "image_url",
                                "image_url": {"url": data_uri}
                            }
                        ]
                    }
                ],
                max_tokens=500
            )

            return response.choices[0].message.content
        except Exception as e:
            # Tool boundary: failures are reported to the agent as text
            # instead of propagating.
            return f"Error describing image: {str(e)}"
107
+
108
+
109
  def return_tools() -> list[Tool]:
110
  """
111
  Returns a list of tools to be used by the agent.
 
113
  return [
114
  read_file(),
115
  transcribe_audio(),
116
+ describe_image(),
117
  DuckDuckGoSearchTool(),
118
  PythonInterpreterTool(),
119
  VisitWebpageTool(),