import openai, gradio as gr, json, tempfile, plotly.graph_objects as go
from pathlib import Path
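
# Multimodal chat demo: the model answers either in plain text or with a small
# JSON directive ({"type": "image" | "chart" | "table" | "audio", ...}) that is
# rendered as a DALL-E 3 image, a Plotly chart, an HTML table, or TTS audio.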
SYSTEM_PROMPT = """You are a multimodal assistant. Return one of the following response types:
- Plain text (just a natural reply)
- JSON object: {"type":"image","prompt":"<dalle-3 prompt>"}
- JSON object: {"type":"chart","title":"<title>","data":[{"x":[...],"y":[...],"label":"..."}]}
- JSON object: {"type":"table","headers":["A","B"],"rows":[[1,2],[3,4]]}
- JSON object: {"type":"audio","text":"Text to speak"}
Respond in plain text unless image/chart/table/audio is clearly required.
"""
def build_messages(history, user_msg):
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for u, a in history:
        messages.append({"role": "user", "content": u})
        messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})
    return messages
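
# Send the conversation to GPT-4o, then try to parse the reply as a JSON
# directive. Depending on its "type", generate an image, build a Plotly chart,
# render an HTML table, or synthesize speech; otherwise treat it as plain text.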
def multimodal_chat(api_key, user_msg, history):
    if not api_key:
        raise gr.Error("🔑 Please provide your OpenAI API key.")
    client = openai.OpenAI(api_key=api_key)
    messages = build_messages(history, user_msg)
    response = client.chat.completions.create(model="gpt-4o", messages=messages)
    content = response.choices[0].message.content.strip()
    img_url, fig, table_html, audio_path = None, None, None, None
    try:
        parsed = json.loads(content)
        t = parsed.get("type")
        if t == "image":
            # Generate an image with DALL-E 3 and echo it into the chat as Markdown.
            img = client.images.generate(model="dall-e-3", prompt=parsed["prompt"], size="1024x1024", n=1)
            img_url = img.data[0].url
            history.append([user_msg, f"![generated image]({img_url})"])
        elif t == "chart":
            # One Plotly trace per series in the directive.
            fig = go.Figure()
            for s in parsed["data"]:
                fig.add_trace(go.Scatter(x=s["x"], y=s["y"], mode="lines+markers", name=s.get("label", "")))
            fig.update_layout(title=parsed["title"])
            history.append([user_msg, parsed["title"]])
        elif t == "table":
            # Render headers and rows as a plain HTML table.
            headers = parsed["headers"]
            rows = parsed["rows"]
            table_html = f"<table><thead><tr>{''.join(f'<th>{h}</th>' for h in headers)}</tr></thead><tbody>"
            table_html += ''.join(f"<tr>{''.join(f'<td>{c}</td>' for c in row)}</tr>" for row in rows)
            table_html += "</tbody></table>"
            history.append([user_msg, "Table generated below"])
        elif t == "audio":
            # Synthesize speech with TTS-1 and write it to a temporary MP3 file.
            audio = client.audio.speech.create(model="tts-1", voice="alloy", input=parsed["text"])
            path = str(Path(tempfile.gettempdir()) / "audio.mp3")
            with open(path, "wb") as f:
                f.write(audio.read())
            audio_path = path
            history.append([user_msg, parsed["text"]])
        else:
            # Valid JSON but not a recognized directive: show it verbatim.
            history.append([user_msg, content])
    except Exception:
        # Not JSON (or generation failed): treat the reply as plain text.
        history.append([user_msg, content])
    return history, img_url, fig, table_html, audio_path
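
# Gradio UI: chat window plus one dedicated output component per modality.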
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("🤖 **Multimodal Assistant** – Text, Images, Charts, Tables, Audio", elem_id="zen-header")
    api_key = gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-...")
    chatbot = gr.Chatbot()
    with gr.Row():
        user_msg = gr.Textbox(label="Your message", scale=4)
        send_btn = gr.Button("Send", variant="primary")
    img_out = gr.Image()
    chart_out = gr.Plot()
    table_out = gr.HTML()
    audio_out = gr.Audio(type="filepath")

    def respond(api_key, user_msg, chat_history):
        # Run one chat turn and route each modality to its output component.
        chat_history, img_url, fig, table, audio = multimodal_chat(api_key, user_msg, chat_history)
        return chat_history, gr.update(value=img_url), gr.update(value=fig), gr.update(value=table), gr.update(value=audio)

    send_btn.click(respond, [api_key, user_msg, chatbot], [chatbot, img_out, chart_out, table_out, audio_out])
    user_msg.submit(respond, [api_key, user_msg, chatbot], [chatbot, img_out, chart_out, table_out, audio_out])

if __name__ == "__main__":
    demo.queue().launch()